From 1fd8099895ac15df430caccbeae38574192dd8fe Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 7 May 2022 18:08:22 +0700 Subject: [PATCH 001/297] added train script but with prefix manually declared --- pretrain_mp3_gpt.py | 258 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 pretrain_mp3_gpt.py diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py new file mode 100644 index 000000000..cd1000631 --- /dev/null +++ b/pretrain_mp3_gpt.py @@ -0,0 +1,258 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pretrain GPT""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prompt', 'answer'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + + # Get the masks and postition ids. 
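+    # prefix_indices marks, per sample, where the prompt ends; the call below
+    # uses it to allow bidirectional attention over the prefix and, with
+    # loss_on_targets_only, to keep prompt tokens out of the loss mask.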
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + + # Get the masks and position ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
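+    # The timer brackets data loading only, so its cost shows up separately
+    # from the model's forward compute in the timing logs.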
+ timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + import sys + sys.exit() + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From fe3f8c27218d5f53d2588729e5e1f60acb79045b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 11:06:47 +0700 Subject: [PATCH 002/297] made new dataset --- megatron/data/non_causal_mtf_dataset.py | 527 ++++++++++++++++++++++++ pretrain_mp3_gpt.py | 12 +- train_mp3_gpt.sh | 115 ++++++ 3 files changed, 649 insertions(+), 5 deletions(-) create mode 100644 megatron/data/non_causal_mtf_dataset.py create mode 100644 
train_mp3_gpt.sh diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py new file mode 100644 index 000000000..4c3bc8e50 --- /dev/null +++ b/megatron/data/non_causal_mtf_dataset.py @@ -0,0 +1,527 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPT style dataset.""" + +import os +import time + +import numpy as np +import torch + +from megatron import mpu, print_rank_0 +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. 
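+    # A group defined by a single path needs no blending and is built directly.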
+ if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. 
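+    # splits holds four boundaries; each split below covers the half-open
+    # document range [splits[i], splits[i + 1]).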
+ print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class NonCausalMTFDataset(torch.utils.data.Dataset): + + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed + ): + + self.name = name + self.indexed_dataset = indexed_dataset + + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. + self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + self.name, data_prefix, documents, self.indexed_dataset.sizes, + num_samples, seq_length, seed) + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + if doc_index_f == doc_index_l: + sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f, + length=offset_l - offset_f + 1) + else: + # Otherwise, get the rest of the initial document. + sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + offset=offset_f)] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # And finally add the relevant portion of last document. + sample_list.append(self.indexed_dataset.get( + self.doc_idx[doc_index_l], + length=offset_l + 1)) + sample = np.concatenate(sample_list) + + return { + 'text': np.array(sample, dtype=np.int64), + 'prefix_len': 0 + } + + +def _build_index_mappings(name, data_prefix, documents, sizes, + num_samples, seq_length, seed, cutoff_last_epoch=0.95): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. 
+ shuffle-idx: maps the sample index into a random index into sample-idx. + """ + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = data_prefix + _filename += '_{}_indexmap'.format(name) + _filename += '_{}ns'.format(num_samples) + _filename += '_{}sl'.format(seq_length) + _filename += '_{}s'.format(seed) + doc_idx_filename = _filename + '_doc_idx.npy' + sample_idx_filename = _filename + '_sample_idx.npy' + shuffle_idx_filename = _filename + '_shuffle_idx.npy' + + # Build the indexed mapping if not exist. + if torch.distributed.get_rank() == 0: + if (not os.path.isfile(doc_idx_filename)) or \ + (not os.path.isfile(sample_idx_filename)) or \ + (not os.path.isfile(shuffle_idx_filename)): + + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(' > only one epoch required, setting ' + 'separate_last_epoch to False', flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ( + (num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - \ + num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, \ + f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= num_samples_per_epoch, \ + f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' + # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, + # seperate out the epoch and treat it differently. + separate_last_epoch = (last_epoch_num_samples < + int(cutoff_last_epoch * num_samples_per_epoch)) + if separate_last_epoch: + string = ' > last epoch number of samples ({}) is smaller '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to True' + else: + string = ' > last epoch number of samples ({}) is larger '\ + 'than {}% of number of samples per epoch ({}), '\ + 'setting separate_last_epoch to False' + print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, + num_samples_per_epoch), flush=True) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, + separate_last_epoch) + np.save(doc_idx_filename, doc_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save doc-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. 
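+            # `helpers` is the C++ extension built from megatron/data/helpers.cpp;
+            # it is assumed to have been compiled earlier in startup.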
+ from megatron.data import helpers + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch) + # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, + # num_epochs, tokens_per_epoch) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save sample-idx mapping ' + '(seconds): {:4f}'.format(time.time() - start_time)) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, + sample_idx.shape[0] - 1, np_rng) + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load mappings. + start_time = time.time() + print_rank_0(' > loading doc-idx mapping from {}'.format( + doc_idx_filename)) + doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading sample-idx mapping from {}'.format( + sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + sample_idx.shape[0])) + print_rank_0(' total number of epochs: {}'.format(num_epochs)) + + return doc_idx, sample_idx, shuffle_idx + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. 
+ Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_sample_idx(sizes, doc_idx, seq_length, + num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += (remaining_seq_length + doc_length - 1) + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. + doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. 
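+        # Each sample_idx row is (index into doc_idx, offset into that
+        # document), marking where the next sample begins.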
+ sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index cd1000631..acf143573 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -22,7 +22,8 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +# from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -66,7 +67,7 @@ def model_provider(pre_process=True, post_process=True): see_memory_usage(f"After Building Model", force=True) return model -_KEYS = ['text', 'prompt', 'answer'] +_KEYS = ['text', 'prefix_len'] def get_batch(data_iterator): """Generate a batch""" @@ -90,7 +91,10 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + prefix_indices = data_b['prefix_len'].long() + print(prefix_indices) + import sys + sys.exit() # Get the masks and postition ids. 
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -192,8 +196,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): seed=args.seed, skip_warmup=(not args.mmap_warmup)) - import sys - sys.exit() # Option 2 of data loading using --(train|valid|test)-weighted-split-paths elif args.train_weighted_split_paths: assigned_train_valid_test = [] diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh new file mode 100644 index 000000000..56a1c8767 --- /dev/null +++ b/train_mp3_gpt.sh @@ -0,0 +1,115 @@ +CHECKPOINT_PATH=checkpoints/gpt2 +VOCAB_FILE=data/gpt2-vocab.json +MERGE_FILE=data/gpt2-merges.txt +DATA_PATH=data/meg-gpt2_oscar-combined_text_document +TENSORBOARD_PATH=output_dir/tensorboard +CODECARBON_PATH=output_dir/codecarbon + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=16 +TP_SIZE=1 +PP_SIZE=1 + +N_GPUS=2 +SAVE_INTERVAL=100 + +# --train-samples 10_000 \ +# --exit-interval $EXIT_INTERVAL \ + +# --exit-interval 100 \ +GPT_ARGS=" \ + --num-layers 2 \ + --hidden-size 64 \ + --num-attention-heads 2 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --rampup-batch-size 2 2 1_000 \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-samples 100 \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr 1e-4 \ + --lr-warmup-samples 5 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --fp16 \ + " +# --train-iters 500 \ + +OUTPUT_ARGS=" \ + --log-interval 10 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 100 \ + --eval-iters 10 \ + --checkpoint-activations \ + " + +# --codecarbon-dir $CODECARBON_PATH \ +DATA_ARGS=" \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS" + +# if you can't stand pt-1.9 launcher noise +export LOGLEVEL=WARNING + +LAUNCHER="deepspeed --num_gpus $N_GPUS" +export CMD=" \ + $LAUNCHER pretrain_mp3_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --distributed-backend nccl \ + $ALL_ARGS \ + " + +echo $CMD + +$CMD From 31f20877262e7f839e44b087fbacb6f85a6a03fc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 11:12:26 +0700 Subject: [PATCH 003/297] minor adjustments --- train_mp3_gpt.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh index 56a1c8767..0a9407a90 100644 --- a/train_mp3_gpt.sh +++ b/train_mp3_gpt.sh @@ -1,9 +1,9 @@ -CHECKPOINT_PATH=checkpoints/gpt2 +CHECKPOINT_PATH=data/checkpoints/gpt2 
VOCAB_FILE=data/gpt2-vocab.json MERGE_FILE=data/gpt2-merges.txt -DATA_PATH=data/meg-gpt2_oscar-combined_text_document -TENSORBOARD_PATH=output_dir/tensorboard -CODECARBON_PATH=output_dir/codecarbon +DATA_PATH=data/t0-test +TENSORBOARD_PATH=data/checkpoints/tensorboard +CODECARBON_PATH=data/checkpoints/codecarbon MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=16 From 6ccacba4fbf5b7bb84c7dcdcbbd33d3115ecec72 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:00:24 +0000 Subject: [PATCH 004/297] added capabilities for padding and prefix lm index --- megatron/data/non_causal_mtf_dataset.py | 59 +++++++++++++++---------- pretrain_mp3_gpt.py | 7 +-- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 4c3bc8e50..da8556c50 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -21,7 +21,7 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ @@ -239,6 +239,10 @@ def __init__( self.name = name self.indexed_dataset = indexed_dataset + self.seq_length = seq_length + + # vocab + self.tokenizer = get_tokenizer() # Checks assert np.min(documents) >= 0 @@ -257,32 +261,27 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) + doc_idx = self.sample_idx[idx][0] + + sample = self.indexed_dataset.get( + self.doc_idx[doc_idx] + ) + + eod_idx = np.where(sample == self.tokenizer.eod)[0] + if len(eod_idx) > 0: + prefix_len = eod_idx[0] else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) + prefix_len = 0 + + sample = pad_and_convert_to_numpy( + sample, + self.tokenizer.pad, + self.seq_length + ) return { 'text': np.array(sample, dtype=np.int64), - 'prefix_len': 0 + 'prefix_len': prefix_len } @@ -525,3 +524,17 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): np_rng.shuffle(shuffle_idx_last) return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens and token types. 
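+    # Right-pad with pad_id up to max_seq_length so samples batch to a fixed
+    # shape; int64 matches the dtype of the 'text' field.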
+ filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index acf143573..8dccce361 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -22,7 +22,6 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -# from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain @@ -83,6 +82,7 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None + print(data) data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -92,9 +92,6 @@ def get_batch(data_iterator): # Prefix prefix_indices = data_b['prefix_len'].long() - print(prefix_indices) - import sys - sys.exit() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -131,7 +128,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = [len(seq) for seq in data_b['prompt'].long()] + prefix_indices = data_b['prefix_len'].long() # Get the masks and position ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From 155a8ef65dcac9b9a8cec068b5bf9e4f661fdc60 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:09:24 +0000 Subject: [PATCH 005/297] added finetune script --- examples/finetune_mp3.sh | 42 ++++++++++++++++++++++++++++++++++++++++ pretrain_mp3_gpt.py | 3 +-- 2 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 examples/finetune_mp3.sh diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh new file mode 100644 index 000000000..1e9919a86 --- /dev/null +++ b/examples/finetune_mp3.sh @@ -0,0 +1,42 @@ +#! /bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=data/t0-test_text_document +CHECKPOINT_PATH=data + + +deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ + --num-layers 2 \ + --hidden-size 128 \ + --num-attention-heads 4 \ + --micro-batch-size 4 \ + --global-batch-size 16 \ + --seq-length 512 \ + --max-position-embeddings 512 \ + --train-iters 10000 \ + --lr-decay-iters 5000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir GPT2 diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 8dccce361..4dd8a4160 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -82,7 +82,6 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None - print(data) data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. 
@@ -91,7 +90,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() #.long() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From ec6c07e5e9551eae32ca0b36d16fa9369bdc73e9 Mon Sep 17 00:00:00 2001 From: lintangsutawika Date: Mon, 9 May 2022 08:19:38 +0000 Subject: [PATCH 006/297] removed script --- train_mp3_gpt.sh | 115 ----------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 train_mp3_gpt.sh diff --git a/train_mp3_gpt.sh b/train_mp3_gpt.sh deleted file mode 100644 index 0a9407a90..000000000 --- a/train_mp3_gpt.sh +++ /dev/null @@ -1,115 +0,0 @@ -CHECKPOINT_PATH=data/checkpoints/gpt2 -VOCAB_FILE=data/gpt2-vocab.json -MERGE_FILE=data/gpt2-merges.txt -DATA_PATH=data/t0-test -TENSORBOARD_PATH=data/checkpoints/tensorboard -CODECARBON_PATH=data/checkpoints/codecarbon - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=16 -TP_SIZE=1 -PP_SIZE=1 - -N_GPUS=2 -SAVE_INTERVAL=100 - -# --train-samples 10_000 \ -# --exit-interval $EXIT_INTERVAL \ - -# --exit-interval 100 \ -GPT_ARGS=" \ - --num-layers 2 \ - --hidden-size 64 \ - --num-attention-heads 2 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --rampup-batch-size 2 2 1_000 \ - --global-batch-size $GLOBAL_BATCH_SIZE \ - --train-samples 100 \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.95 \ - --adam-eps 1e-8 \ - --lr 1e-4 \ - --lr-warmup-samples 5 \ - --clip-grad 1.0 \ - --weight-decay 1e-1 \ - --vocab-file $VOCAB_FILE \ - --merge-file $MERGE_FILE \ - --fp16 \ - " -# --train-iters 500 \ - -OUTPUT_ARGS=" \ - --log-interval 10 \ - --save-interval $SAVE_INTERVAL \ - --eval-interval 100 \ - --eval-iters 10 \ - --checkpoint-activations \ - " - -# --codecarbon-dir $CODECARBON_PATH \ -DATA_ARGS=" \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tensorboard-dir $TENSORBOARD_PATH \ - --tensorboard-queue-size 5 \ - --log-timers-to-tensorboard \ - --log-batch-size-to-tensorboard \ - --log-validation-ppl-to-tensorboard \ - " - - -ZERO_STAGE=1 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " - -ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS" - -# if you can't stand pt-1.9 launcher noise -export LOGLEVEL=WARNING - -LAUNCHER="deepspeed --num_gpus $N_GPUS" -export CMD=" \ - $LAUNCHER pretrain_mp3_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - --distributed-backend nccl \ - $ALL_ARGS \ - " - -echo $CMD - -$CMD From 435d65ff59a29e4ce6eeb5a7fb4836c4fe2141e9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 22:59:26 +0700 Subject: [PATCH 007/297] added adjustments and new dataset --- megatron/data/non_causal_mlm_dataset.py | 165 
++++++++++++++++++++++++ megatron/data/non_causal_mtf_dataset.py | 52 +------- megatron/tokenizer/tokenizer.py | 19 +++ pretrain_mp3_gpt.py | 4 +- 4 files changed, 187 insertions(+), 53 deletions(-) create mode 100644 megatron/data/non_causal_mlm_dataset.py diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py new file mode 100644 index 000000000..d5f435d37 --- /dev/null +++ b/megatron/data/non_causal_mlm_dataset.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""T5 Style dataset.""" + +import collections + +import numpy as np +import torch + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class NonCausalMLMDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, + short_seq_prob, seed): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. + tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + + def __len__(self): + return self.samples_mapping.shape[0] + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. + np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, + self.max_seq_length, # needed for padding + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_id, self.sep_id, + self.mask_id, self.pad_id, + self.masked_lm_prob, np_rng, + self.bos_id, self.eos_id, + self.sentinel_tokens) + + +def build_training_sample(sample, + max_seq_length, + vocab_id_list, vocab_id_to_token_dict, + cls_id, sep_id, mask_id, pad_id, + masked_lm_prob, np_rng, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Build training sample. 
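+    Tokens are masked via create_masked_lm_predictions and all returned
+    arrays are padded out to max_seq_length.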
+ + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_id: Start of example id. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + masked_lm_prob: Probability to mask tokens. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + # Truncate to `target_sequence_length`. + max_num_tokens = max_seq_length + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Masking. + max_predictions_per_seq = masked_lm_prob * max_num_tokens + (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng + ) + + # Padding. + padded_tokens = pad_and_convert_to_numpy(tokens, max_seq_length) + padded_labels = pad_and_convert_to_numpy(labels, max_seq_length) + padded_masks = pad_and_convert_to_numpy(masks, max_seq_length) + + print(padded_tokens) + print(padded_labels) + import sys + sys.exit() + + train_sample = { + 'text': padded_tokens, + 'labels': padded_labels, + 'mask': padded_masks, + 'prefix_len': 0 + } + return train_sample + + +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + """Pad sequences and convert them to numpy.""" + + # Some checks. + num_tokens = len(tokens) + padding_length = max_seq_length - num_tokens + assert padding_length >= 0 + + # Tokens and token types. + filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np \ No newline at end of file diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index da8556c50..95a005833 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -370,8 +370,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, assert sizes.dtype == np.int32 sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) - # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length, - # num_epochs, tokens_per_epoch) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save sample-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) @@ -455,55 +454,6 @@ def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): return np.concatenate((doc_idx_first, doc_idx_last)) -def _build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch): - """Sample index mapping is a 2D array with sizes - [number-of-samples + 1, 2] where [..., 0] contains - the index into `doc_idx` and [..., 1] is the - starting offset in that document.""" - - # Total number of samples. For -1 see comments in `_num_epochs`. 
- num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length - sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) - - # Index into sample_idx. - sample_index = 0 - # Index into doc_idx. - doc_idx_index = 0 - # Begining offset for each document. - doc_offset = 0 - # Start with first document and no offset. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - while sample_index <= num_samples: - # Start with a fresh sequence. - remaining_seq_length = seq_length + 1 - while remaining_seq_length != 0: - # Get the document length. - doc_id = doc_idx[doc_idx_index] - doc_length = sizes[doc_id] - doc_offset - # And add it to the current sequence. - remaining_seq_length -= doc_length - # If we have more than a full sequence, adjust offset and set - # remaining length to zero so we return from the while loop. - # Note that -1 here is for the same reason we have -1 in - # `_num_epochs` calculations. - if remaining_seq_length <= 0: - doc_offset += (remaining_seq_length + doc_length - 1) - remaining_seq_length = 0 - else: - # Otherwise, start from the begining of the next document. - doc_idx_index += 1 - doc_offset = 0 - # Record the sequence. - sample_idx[sample_index][0] = doc_idx_index - sample_idx[sample_index][1] = doc_offset - sample_index += 1 - - return sample_idx - - def _build_shuffle_idx(num_samples, total_size, np_rng): """Build the range [0, size) and shuffle.""" print(' > building shuffle index with split [0, {}) and [{}, {}) ' diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 0b8580b34..45ff9c5b6 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -331,6 +331,9 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} + if 'mask_token' not in self.tokenizer.special_tokens_map: + tokenizer.mask_token = "" + @property def vocab_size(self): return len(self.tokenizer) # vocab_size doesn't contain additional tokens @@ -353,6 +356,22 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + @property def eod(self): return self.tokenizer.eos_token_id diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 4dd8a4160..4e61c184e 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -90,7 +90,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() #.long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -127,7 +127,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and position ids. 
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( From b1d7bbde01de05943fbafb13c0c5d0702107c6ac Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:08:19 +0700 Subject: [PATCH 008/297] try mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 210 +++++++++++++++++++++++- pretrain_mp3_gpt.py | 3 +- 2 files changed, 206 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d5f435d37..bb75d7367 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -13,18 +13,216 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""T5 Style dataset.""" +"""GPT Non-Causal Mask Language Model Finetune Style dataset.""" import collections import numpy as np import torch -from megatron import get_tokenizer -from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping -) +from megatron import mpu, print_rank_0, get_tokenizer +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.dataset_utils import create_masked_lm_predictions, get_samples_mapping +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. 
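+    # Same shortcut as the MTF builder: a single path means no blending.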
+ if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMTFDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. 
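+    # As in the MTF dataset, each split below is the half-open document
+    # range between consecutive boundaries.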
+ print_rank_0(' > dataset split:')
+
+ def print_split_stats(name, index):
+ print_rank_0(' {}:'.format(name))
+ print_rank_0(' document indices in [{}, {}) total of {} '
+ 'documents'.format(splits[index], splits[index + 1],
+ splits[index + 1] - splits[index]))
+ print_split_stats('train', 0)
+ print_split_stats('validation', 1)
+ print_split_stats('test', 2)
+
+ def build_dataset(index, name):
+ dataset = None
+ if splits[index + 1] > splits[index]:
+ documents = np.arange(start=splits[index], stop=splits[index + 1],
+ step=1, dtype=np.int32)
+ dataset = NonCausalMTFDataset(name, data_prefix,
+ documents, indexed_dataset,
+ train_valid_test_num_samples[index],
+ seq_length, seed)
+ return dataset
+
+ train_dataset = build_dataset(0, 'train')
+ valid_dataset = build_dataset(1, 'valid')
+ test_dataset = build_dataset(2, 'test')
+
+ return (train_dataset, valid_dataset, test_dataset)
+
+
+def get_indexed_dataset_(path, data_impl, skip_warmup):
+ """Build indexed dataset."""
+ print_rank_0(' > building dataset index ...')
+ start_time = time.time()
+ indexed_dataset = make_indexed_dataset(path,
+ data_impl,
+ skip_warmup)
+ print_rank_0(' > finished creating indexed dataset in {:4f} '
+ 'seconds'.format(time.time() - start_time))
+ print_rank_0(' number of documents: {}'.format(
+ indexed_dataset.sizes.shape[0]))
+
+ return indexed_dataset
+

 class NonCausalMLMDataset(torch.utils.data.Dataset):

diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py
index 4e61c184e..b7af289a5 100644
--- a/pretrain_mp3_gpt.py
+++ b/pretrain_mp3_gpt.py
@@ -22,7 +22,8 @@
 from megatron import get_timers
 from megatron import get_tokenizer
 from megatron import mpu
-from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group
+# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group
+from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group
 from megatron.model import GPTModel, GPTModelPipe
 from megatron.training import pretrain
 from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_

From 714e5b75f9a9d24d90b929f195d5e655073c7985 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:09:42 +0700
Subject: [PATCH 009/297] minor changes

---
 megatron/tokenizer/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 45ff9c5b6..a74b84c1a 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -332,7 +332,7 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 self.decoder = {v: k for k, v in self.encoder.items()}

 if 'mask_token' not in self.tokenizer.special_tokens_map:
- tokenizer.mask_token = "<mask>"
+ self.tokenizer.mask_token = "<mask>"

 @property
 def vocab_size(self):

From dc1543638e7727a0458d0cad83aef22c8863f4d9 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:10:51 +0700
Subject: [PATCH 010/297] minor addition of import packages

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index bb75d7367..82c9eb66b 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -15,6 +15,8 @@

 """GPT Non-Causal Masked Language Model Finetune Style dataset."""

+import os
+import time
 import collections

 import numpy as np

From 
e79ac16fc6c36d0d51896a933a488d3c744333ac Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:12:04 +0700 Subject: [PATCH 011/297] minor error fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 82c9eb66b..c7a251f78 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -156,7 +156,7 @@ def build_dataset(name): if splits[1] > splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, + dataset = NonCausalMLMDataset(name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], seq_length, seed) From 9a90a2e6e28b68236a65908a8dfa67e1ae0726ea Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:14:07 +0700 Subject: [PATCH 012/297] minor error fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index c7a251f78..ceeceb559 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -198,7 +198,7 @@ def build_dataset(index, name): if splits[index + 1] > splits[index]: documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, + dataset = NonCausalMLMDataset(name, data_prefix, documents, indexed_dataset, train_valid_test_num_samples[index], seq_length, seed) From 7e79b487489351f14793bb343e599523880e8903 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 23:24:14 +0700 Subject: [PATCH 013/297] samples follow how gpt dataset is loaded --- megatron/data/non_causal_mlm_dataset.py | 68 +++++++++++++++++-------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ceeceb559..b4959e469 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -228,31 +228,26 @@ def get_indexed_dataset_(path, data_impl, skip_warmup): class NonCausalMLMDataset(torch.utils.data.Dataset): - def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, - short_seq_prob, seed): + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + num_samples, + seq_length, + seed, + ): # Params to store. self.name = name + self.seq_length = seq_length self.seed = seed self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length # Dataset. self.indexed_dataset = indexed_dataset - # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) - # Vocab stuff. tokenizer = get_tokenizer() self.vocab_id_list = list(tokenizer.inv_vocab.keys()) @@ -266,15 +261,46 @@ def __init__(self, name, indexed_dataset, data_prefix, self.sentinel_tokens = tokenizer.additional_special_tokens_ids assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" + # Checks + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + # Build index mappings. 
+ self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings(
+ self.name, data_prefix, documents, self.indexed_dataset.sizes,
+ num_samples, seq_length, seed)
+
 def __len__(self):
- return self.samples_mapping.shape[0]
+ # -1 is due to data structure used to retrieve the index:
+ # sample i --> [sample_idx[i], sample_idx[i+1])
+ return self.sample_idx.shape[0] - 1

 def __getitem__(self, idx):
- start_index, end_index, seq_length = self.samples_mapping[idx]
- sample = []
- for index in range(start_index, end_index):
- sample.append(self.indexed_dataset[index])
+ idx = self.shuffle_idx[idx]
+ # Start and end documents and offsets.
+ doc_index_f = self.sample_idx[idx][0]
+ doc_index_l = self.sample_idx[idx + 1][0]
+ offset_f = self.sample_idx[idx][1]
+ offset_l = self.sample_idx[idx + 1][1]
+ # If we are within the same document, just extract the chunk.
+ if doc_index_f == doc_index_l:
+ sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
+ offset=offset_f,
+ length=offset_l - offset_f + 1)
+ else:
+ # Otherwise, get the rest of the initial document.
+ sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
+ offset=offset_f)]
+ # Loop over all in between documents and add the entire document.
+ for i in range(doc_index_f + 1, doc_index_l):
+ sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
+ # And finally add the relevant portion of last document.
+ sample_list.append(self.indexed_dataset.get(
+ self.doc_idx[doc_index_l],
+ length=offset_l + 1))
+ sample = np.concatenate(sample_list)
+
 # Note that this rng state should be numpy and not python since
 # python randint is inclusive whereas the numpy one is exclusive.
 np_rng = np.random.RandomState(seed=(self.seed + idx))

From 3453dbdd1a55c21a79bd194d005a0e26676885b3 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:26:03 +0700
Subject: [PATCH 014/297] added masked_lm_prob

---
 megatron/data/non_causal_mlm_dataset.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index b4959e469..e3599be22 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -237,6 +237,7 @@ def __init__(
 num_samples,
 seq_length,
 seed,
+ masked_lm_prob=0.15,
 ):

 # Params to store.
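To make the `sample_idx` arithmetic in `__getitem__` above concrete, a small invented example (these arrays are illustrative, not output of the real mapping code):

import numpy as np

# Three documents of 5, 3 and 4 tokens, visited in the shuffled order
# doc_idx = [2, 0, 1], with seq_length = 4 (each sample retrieves
# seq_length + 1 = 5 tokens; the last one overlaps the next sample).
doc_idx = np.array([2, 0, 1], dtype=np.int32)
sample_idx = np.array([[0, 0],    # sample 0 starts at doc slot 0, offset 0
                       [0, 4],    # sample 1 starts at doc slot 0, offset 4
                       [2, 0]],   # end marker after the final sample
                      dtype=np.int32)
# Sample 0 spans (slot 0, offset 0)..(slot 0, offset 4): all of document 2.
# Sample 1 spans (slot 0, offset 4)..(slot 2, offset 0): the last token of
# document 2, all of document 0, and the first token of document 1 -- the
# multi-document branch of __getitem__, and the reason __len__ returns
# sample_idx.shape[0] - 1.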
From d382d19ad24f25d319d2a3c60d3b867265f0dbd0 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:51:48 +0700
Subject: [PATCH 015/297] added mask id

---
 megatron/tokenizer/tokenizer.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index a74b84c1a..ff82dc67b 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -333,6 +333,15 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):

 if 'mask_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.mask_token = "<mask>"
+ self.tokenizer.mask_id = self.tokenizer.vocab_size-1
+
+ if 'cls_token' not in self.tokenizer.special_tokens_map:
+ self.tokenizer.cls_token = "<cls>"
+ self.tokenizer.cls_id = self.tokenizer.vocab_size-2
+
+ if 'sep_token' not in self.tokenizer.special_tokens_map:
+ self.tokenizer.sep_token = "<sep>"
+ self.tokenizer.sep_id = self.tokenizer.vocab_size-3

 @property
 def vocab_size(self):

From 31fbf550492c4c32c98c52fb5c465c42ae84431e Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:53:37 +0700
Subject: [PATCH 016/297] added mask id

---
 megatron/tokenizer/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index ff82dc67b..1b4a6cc27 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -367,19 +367,19 @@ def detokenize(self, token_ids):

 @property
 def cls(self):
- return self.cls_id
+ return self.cls_token_id

 @property
 def sep(self):
- return self.sep_id
+ return self.sep_token_id

 @property
 def pad(self):
- return self.pad_id
+ return self.pad_token_id

 @property
 def mask(self):
- return self.mask_id
+ return self.mask_token_id

 @property
 def eod(self):

From f57113270cb1459c78691063a376673d10574797 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 9 May 2022 23:54:54 +0700
Subject: [PATCH 017/297] added mask id

---
 megatron/tokenizer/tokenizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 1b4a6cc27..5b9cdc01b 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -333,15 +333,15 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):

 if 'mask_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.mask_token = "<mask>"
- self.tokenizer.mask_id = self.tokenizer.vocab_size-1
+ self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1

 if 'cls_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.cls_token = "<cls>"
- self.tokenizer.cls_id = self.tokenizer.vocab_size-2
+ self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2

 if 'sep_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.sep_token = "<sep>"
- self.tokenizer.sep_id = self.tokenizer.vocab_size-3
+ self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3

From 5548a47164dac11487af4b85d089b882ebd9111a Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Tue, 10 May 2022 00:19:03 +0700
Subject: [PATCH 018/297] added mask id

---
 megatron/tokenizer/tokenizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 5b9cdc01b..c7efa6cfe 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -333,15 +333,15 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):

 if 'mask_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.mask_token = "<mask>"
- self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1
+ # self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1

 if 'cls_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.cls_token = "<cls>"
- self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2
+ # self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2

 if 'sep_token' not in self.tokenizer.special_tokens_map:
 self.tokenizer.sep_token = "<sep>"
- self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3
+ # self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3

 @property
 def vocab_size(self):

From 21237f3d8dfc0620a750cabb621b86357f512fa9 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Tue, 10 May 2022 00:20:20 +0700
Subject: [PATCH 019/297] added fix

---
 megatron/tokenizer/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index c7efa6cfe..9ef60fce1 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -367,19 +367,19 @@ def detokenize(self, token_ids):

 @property
 def cls(self):
- return self.cls_token_id
+ return self.tokenizer.cls_token_id

 @property
 def sep(self):
- return self.sep_token_id
+ return self.tokenizer.sep_token_id

 @property
 def pad(self):
- return self.pad_token_id
+ return self.tokenizer.pad_token_id

 @property
 def mask(self):
- return self.mask_token_id
+ return self.tokenizer.mask_token_id

 @property
 def eod(self):

From 98c4635399d4aa8b6270ab040d7c0dd41160c25d Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Tue, 10 May 2022 00:23:02 +0700
Subject: [PATCH 020/297] added bos and eos token id

---
 megatron/tokenizer/tokenizer.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 9ef60fce1..01accbad3 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -381,6 +381,16 @@ def pad(self):
 def mask(self):
 return self.tokenizer.mask_token_id

+ @property
+ def bos_token_id(self):
+ """ Id of the beginning of sentence token in the vocabulary."""
+ return self.tokenizer.bos_token_id
+
+ @property
+ def eos_token_id(self):
+ """ Id of the end of sentence token in the vocabulary."""
+ return self.tokenizer.eos_token_id
+
 @property
 def eod(self):
 return self.tokenizer.eos_token_id

From e1a75aa6a972e7eb6755cd0a54f36b0275d7d8c1 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Tue, 10 May 2022 00:25:13 +0700
Subject: [PATCH 021/297] no need for sentinel token

---
 megatron/data/non_causal_mlm_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index e3599be22..4929f0dac 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -259,8 +259,6 @@ def __init__(
 self.pad_id = tokenizer.pad
 self.bos_id = tokenizer.bos_token_id
 self.eos_id = tokenizer.eos_token_id
- self.sentinel_tokens = tokenizer.additional_special_tokens_ids
- assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script"

 # Checks
 assert np.min(documents) >= 0
 assert np.max(documents) < indexed_dataset.sizes.shape[0]

From 2fdd7952b45275242993904b0c06638cf9385af0 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Tue, 10 May 2022 00:27:02 +0700
Subject: [PATCH 022/297] add aux functions

---
 megatron/data/non_causal_mlm_dataset.py | 191 ++++++++++++++++++++++++
 1 file changed, 191 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 4929f0dac..a5239f402 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -375,6 +375,197 @@ def build_training_sample(sample,

 return train_sample

+def _build_index_mappings(name, data_prefix, documents, sizes,
+ num_samples, seq_length, seed, cutoff_last_epoch=0.95):
+ """Build doc-idx, sample-idx, and shuffle-idx.
+ doc-idx: is an array (ordered) of documents to be used in training.
+ sample-idx: is the start document index and document offset for each
+ training sample.
+ shuffle-idx: maps the sample index into a random index into sample-idx.
+ """
+ # Number of tokens in each epoch and number of required epochs.
+ tokens_per_epoch = _num_tokens(documents, sizes)
+ num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
+ # rng state
+ np_rng = np.random.RandomState(seed=seed)
+
+ # Filename of the index mappings.
+ _filename = data_prefix
+ _filename += '_{}_indexmap'.format(name)
+ _filename += '_{}ns'.format(num_samples)
+ _filename += '_{}sl'.format(seq_length)
+ _filename += '_{}s'.format(seed)
+ doc_idx_filename = _filename + '_doc_idx.npy'
+ sample_idx_filename = _filename + '_sample_idx.npy'
+ shuffle_idx_filename = _filename + '_shuffle_idx.npy'
+
+ # Build the indexed mapping if not exist.
+ if torch.distributed.get_rank() == 0:
+ if (not os.path.isfile(doc_idx_filename)) or \
+ (not os.path.isfile(sample_idx_filename)) or \
+ (not os.path.isfile(shuffle_idx_filename)):
+
+ print_rank_0(' > WARNING: could not find index map files, building '
+ 'the indices on rank 0 ...')
+
+ # For the last epoch, decide whether to include the entire epoch
+ # in the global shuffle or not.
+
+ # If we need only one epoch, then separating the last epoch does
+ # not mean anything.
+ if num_epochs == 1:
+ separate_last_epoch = False
+ print(' > only one epoch required, setting '
+ 'separate_last_epoch to False', flush=True)
+
+ else:
+ # Get the number of samples for the last epoch
+ num_samples_from_epochs_minus_one = (
+ (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
+ last_epoch_num_samples = num_samples - \
+ num_samples_from_epochs_minus_one
+ assert last_epoch_num_samples >= 0, \
+ f'last epoch number of samples {last_epoch_num_samples} should be non-negative.'
+ num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
+ assert last_epoch_num_samples <= num_samples_per_epoch, \
+ f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.'
+ # If we have fewer than cutoff_last_epoch * num_samples_per_epoch of the samples for the last epoch,
+ # separate out the epoch and treat it differently.
+ separate_last_epoch = (last_epoch_num_samples <
+ int(cutoff_last_epoch * num_samples_per_epoch))
+ if separate_last_epoch:
+ string = ' > last epoch number of samples ({}) is smaller '\
+ 'than {}% of number of samples per epoch ({}), '\
+ 'setting separate_last_epoch to True'
+ else:
+ string = ' > last epoch number of samples ({}) is larger '\
+ 'than {}% of number of samples per epoch ({}), '\
+ 'setting separate_last_epoch to False'
+ print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100,
+ num_samples_per_epoch), flush=True)
+
+ # doc-idx.
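A worked example of the bookkeeping above may help (all numbers invented). With 10,000 tokens per epoch, a sequence length of 512, and 50 requested samples, `_num_epochs` yields three epochs (two epochs supply at most 39 samples), and the decision comes out as:

tokens_per_epoch = 10_000
seq_length = 512
num_samples = 50
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length                  # 19
num_samples_from_epochs_minus_one = (2 * tokens_per_epoch - 1) // seq_length  # 39
last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one      # 11
separate_last_epoch = last_epoch_num_samples < int(0.95 * num_samples_per_epoch)
print(separate_last_epoch)  # True: only 11 of up to 19 last-epoch samples
                            # are drawn, so that epoch is shuffled separately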
+ start_time = time.time()
+ doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
+ separate_last_epoch)
+ np.save(doc_idx_filename, doc_idx, allow_pickle=True)
+ print_rank_0(' > elapsed time to build and save doc-idx mapping '
+ '(seconds): {:4f}'.format(time.time() - start_time))
+ # sample-idx.
+ start_time = time.time()
+ # Use C++ implementation for speed.
+ # First compile and then import.
+ from megatron.data import helpers
+ assert doc_idx.dtype == np.int32
+ assert sizes.dtype == np.int32
+ sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
+ num_epochs, tokens_per_epoch)
+
+ np.save(sample_idx_filename, sample_idx, allow_pickle=True)
+ print_rank_0(' > elapsed time to build and save sample-idx mapping '
+ '(seconds): {:4f}'.format(time.time() - start_time))
+ # shuffle-idx.
+ start_time = time.time()
+ # -1 is due to data structure used to retrieve the index:
+ # sample i --> [sample_idx[i], sample_idx[i+1])
+ if separate_last_epoch:
+ num_samples_ = num_samples_from_epochs_minus_one
+ else:
+ num_samples_ = sample_idx.shape[0] - 1
+ shuffle_idx = _build_shuffle_idx(num_samples_,
+ sample_idx.shape[0] - 1, np_rng)
+ np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
+ print_rank_0(' > elapsed time to build and save shuffle-idx mapping'
+ ' (seconds): {:4f}'.format(time.time() - start_time))
+
+ # This should be a barrier but nccl barrier assumes
+ # device_index=rank which is not the case for model
+ # parallel case
+ counts = torch.cuda.LongTensor([1])
+ torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
+ torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
+ assert counts[0].item() == (
+ torch.distributed.get_world_size() //
+ torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
+
+ # Load mappings.
+ start_time = time.time()
+ print_rank_0(' > loading doc-idx mapping from {}'.format(
+ doc_idx_filename))
+ doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
+ print_rank_0(' > loading sample-idx mapping from {}'.format(
+ sample_idx_filename))
+ sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
+ print_rank_0(' > loading shuffle-idx mapping from {}'.format(
+ shuffle_idx_filename))
+ shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
+ print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
+ time.time() - start_time))
+ print_rank_0(' total number of samples: {}'.format(
+ sample_idx.shape[0]))
+ print_rank_0(' total number of epochs: {}'.format(num_epochs))
+
+ return doc_idx, sample_idx, shuffle_idx
+
+
+def _num_tokens(documents, sizes):
+ """Total number of tokens in the dataset."""
+ return np.sum(sizes[documents])
+
+
+def _num_epochs(tokens_per_epoch, seq_length, num_samples):
+ """Based on number of samples and sequence length, calculate how many
+ epochs will be needed."""
+ num_epochs = 0
+ total_tokens = 0
+ while True:
+ num_epochs += 1
+ total_tokens += tokens_per_epoch
+ # -1 is because we need to retrieve seq_length + 1 token each time
+ # but the last token will overlap with the first token of the next
+ # sample except for the last sample.
+ if ((total_tokens - 1) // seq_length) >= num_samples:
+ return num_epochs
+
+
+def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
+ """Build an array with length = number-of-epochs * number-of-documents.
+ Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print(' > building shuffle index with split [0, {}) and [{}, {}) ' + '...'.format(num_samples, num_samples, total_size), flush=True) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, + step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) + + def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" From 154f39c1ee90ad091f851826807f591701c4b5d2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:28:06 +0700 Subject: [PATCH 023/297] add aux functions --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a5239f402..d9da3b6a6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -304,7 +304,7 @@ def __getitem__(self, idx): # python randint is inclusive whereas the numpy one is exclusive. np_rng = np.random.RandomState(seed=(self.seed + idx)) return build_training_sample(sample, - self.max_seq_length, # needed for padding + self.seq_length, # needed for padding self.vocab_id_list, self.vocab_id_to_token_dict, self.cls_id, self.sep_id, From 3765a81b3222b7711345c99b1d6e7f77aadc4840 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:29:39 +0700 Subject: [PATCH 024/297] add aux functions --- megatron/data/non_causal_mlm_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d9da3b6a6..0ab54df8a 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -342,7 +342,8 @@ def build_training_sample(sample, """ # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] + # tokens = [token for sentence in sample for token in sentence] + tokens = sample # Truncate to `target_sequence_length`. max_num_tokens = max_seq_length From 2cd417484144fda1687f077aaf371d71ff979916 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 10 May 2022 00:31:41 +0700 Subject: [PATCH 025/297] add pad_id --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0ab54df8a..aec33a23f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -358,9 +358,9 @@ def build_training_sample(sample, ) # Padding. 
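Since `pad_and_convert_to_numpy` is only named by its docstring here, a plausible minimal implementation for reference (an assumption for illustration, not the patch's own code):

import numpy as np

def pad_sketch(tokens, pad_id, max_seq_length):
    # Right-pad a token list with pad_id up to max_seq_length.
    assert len(tokens) <= max_seq_length
    return np.array(tokens + [pad_id] * (max_seq_length - len(tokens)),
                    dtype=np.int64)

print(pad_sketch([5, 6, 7], pad_id=0, max_seq_length=6))  # [5 6 7 0 0 0]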
- padded_tokens = pad_and_convert_to_numpy(tokens, max_seq_length) - padded_labels = pad_and_convert_to_numpy(labels, max_seq_length) - padded_masks = pad_and_convert_to_numpy(masks, max_seq_length) + padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) + padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) print(padded_tokens) print(padded_labels) From b592ea33518f88c7b456ddcdfd41df66464476ad Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 18 May 2022 23:47:31 +0700 Subject: [PATCH 026/297] changed lm predictions to t5 --- megatron/data/non_causal_mlm_dataset.py | 19 +++++++++++-------- megatron/data/non_causal_mtf_dataset.py | 7 +++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index aec33a23f..c27cfecf8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -354,16 +354,19 @@ def build_training_sample(sample, max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng - ) + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) - padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) - padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) - print(padded_tokens) - print(padded_labels) + # Padding. + # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) + # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) + # print(padded_tokens) + # print(padded_labels) + + print(tokens) + print(labels) import sys sys.exit() diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 95a005833..6bce2c4ef 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""GPT style dataset.""" +"""GPT Non-Causal Multitask Finetune style dataset.""" import os import time @@ -237,10 +237,13 @@ def __init__( seed ): + # Params to store. self.name = name - self.indexed_dataset = indexed_dataset self.seq_length = seq_length + # Dataset. 
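The masking_style="t5" change above makes create_masked_lm_predictions return whole masked spans instead of single-token predictions. A schematic of the sentinel splice that the later commits build from those spans (tokens and sentinel strings here are illustrative placeholders, not values from the patch):

tokens = ["t0", "t1", "t2", "t3", "t4", "t5"]
span = (2, 4)  # a masked span covering tokens[2:4]

encoder_input = tokens[:span[0]] + ["<extra_id_0>"] + tokens[span[1]:]
decoder_target = ["<extra_id_0>"] + tokens[span[0]:span[1]] + ["</s>"]

print(encoder_input)   # ['t0', 't1', '<extra_id_0>', 't4', 't5']
print(decoder_target)  # ['<extra_id_0>', 't2', 't3', '</s>']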
+ self.indexed_dataset = indexed_dataset
+
 # vocab
 self.tokenizer = get_tokenizer()

From 852faca77c3fa6959321f17eb9ff72b63efd26dc Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Wed, 18 May 2022 23:51:06 +0700
Subject: [PATCH 027/297] changed lm predictions to t5

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index c27cfecf8..36456e819 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -365,7 +365,9 @@ def build_training_sample(sample,
 # print(padded_tokens)
 # print(padded_labels)

+ print("tokens")
 print(tokens)
+ print("labels")
 print(labels)
 import sys
 sys.exit()

From 4333554bb8aaa262549a25bc1a2ee17ca05e265f Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Wed, 18 May 2022 23:57:28 +0700
Subject: [PATCH 028/297] changed lm predictions to t5

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 36456e819..de47bd9ad 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -369,6 +369,8 @@ def build_training_sample(sample,
 print(tokens)
 print("labels")
 print(labels)
+ print("masked_spans")
+ print(masked_spans)
 import sys
 sys.exit()

From be734550af688ecf0b7673f19425fc3407d8e6de Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 19 May 2022 00:00:11 +0700
Subject: [PATCH 029/297] changed lm predictions to t5

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index de47bd9ad..434576ef3 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -367,6 +367,8 @@ def build_training_sample(sample,

 print("tokens")
 print(tokens)
+ print("masks")
+ print(masks)
 print("labels")
 print(labels)
 print("masked_spans")
 print(masked_spans)

From 163c966e9bdd850b1190ae7d9d4b32a178a52405 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 19 May 2022 00:11:48 +0700
Subject: [PATCH 030/297] changed lm predictions to t5

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 434576ef3..b21720618 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -365,6 +365,8 @@ def build_training_sample(sample,
 # print(padded_tokens)
 # print(padded_labels)

+ print("sample")
+ print(sample)
 print("tokens")
 print(tokens)
 print("masks")
 print(masks)

From 56de89f509ebc2adf0165695cd7ed2d4e7a13317 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 19 May 2022 00:59:41 +0700
Subject: [PATCH 031/297] tokenizer add mask, cls, sep tokens

---
 megatron/tokenizer/tokenizer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 01accbad3..2cec3c797 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -332,16 +332,16 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 self.decoder = {v: k for k, v in self.encoder.items()}

 if 'mask_token' not in self.tokenizer.special_tokens_map:
+ self.tokenizer.add_tokens("<mask>")
 self.tokenizer.mask_token = "<mask>"
- # self.tokenizer.mask_token_id = self.tokenizer.vocab_size-1

 if 'cls_token' not in self.tokenizer.special_tokens_map:
- self.tokenizer.cls_token = "<cls>"
- # self.tokenizer.cls_token_id = self.tokenizer.vocab_size-2
+ self.tokenizer.add_tokens("<cls>")
+ self.tokenizer.cls_token = "<cls>"

 if 'sep_token' not in self.tokenizer.special_tokens_map:
- self.tokenizer.sep_token = "<sep>"
- # self.tokenizer.sep_token_id = self.tokenizer.vocab_size-3
+ self.tokenizer.add_tokens("<sep>")
+ self.tokenizer.sep_token = "<sep>"

 @property
 def vocab_size(self):

From ca86fa81d2935065a12fa56d8e186bd8d7a682ac Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:08:10 +0700
Subject: [PATCH 032/297] commit latest changes

---
 megatron/data/non_causal_mlm_dataset.py | 35 +++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index b21720618..970bd7c55 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -259,7 +259,7 @@ def __init__(
 self.pad_id = tokenizer.pad
 self.bos_id = tokenizer.bos_token_id
 self.eos_id = tokenizer.eos_token_id
-
+ self.sentinel_tokens = tokenizer.additional_special_tokens_ids
 # Checks
 assert np.min(documents) >= 0
 assert np.max(documents) < indexed_dataset.sizes.shape[0]
@@ -311,6 +311,7 @@ def __getitem__(self, idx):
 self.mask_id, self.pad_id,
 self.masked_lm_prob, np_rng,
 self.bos_id, self.eos_id,
+ self.sentinel_tokens
 )

@@ -357,7 +358,6 @@ def build_training_sample(sample,
 cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng,
 max_ngrams=10, geometric_dist=True, masking_style="t5")

-
 # Padding.
 # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length)
 # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length)
@@ -365,6 +365,33 @@ def build_training_sample(sample,
 # print(padded_tokens)
 # print(padded_labels)

+ sentinel_tokens = collections.deque(sentinel_tokens)
+ t5_input = []
+ (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
+ (start_index, end_index) = (0, None)
+ for span in masked_spans:
+ flag = sentinel_tokens.popleft()
+
+ # Append the same tokens in decoder input and output
+ t5_decoder_in.append(flag)
+ t5_decoder_in.extend(span.label)
+ t5_decoder_out.append(flag)
+ t5_decoder_out.extend(span.label)
+
+ end_index = span.index[0]
+ t5_input.extend(tokens[start_index: end_index])
+ t5_input.append(flag)
+
+ # the next start index is the token after the last span token
+ start_index = span.index[-1] + 1
+
+ # Add <eos> token to the t5_decoder_out
+ t5_decoder_out.append(eos_id)
+
+ # Add the remaining tokens to the t5 input
+ t5_input.extend(tokens[start_index:])
+

 print("sample")
 print(sample)
 print("tokens")
 print(tokens)
 print("masks")
 print(masks)
 print("labels")
 print(labels)
 print("masked_spans")
 print(masked_spans)
+ for idx, spans in enumerate(masked_spans):
+ spans.index
+ sentinel_tokens
+ labels = spans.labels
 import sys
 sys.exit()

From 5011d9935678a40caa07837918d17d963c514ffa Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:09:07 +0700
Subject: [PATCH 033/297] commit latest changes

---
 megatron/data/non_causal_mlm_dataset.py | 50 ++++++++++++-------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 970bd7c55..dc5d36db5 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -365,31 +365,31 @@ def build_training_sample(sample,
 #
print(padded_labels)

- sentinel_tokens = collections.deque(sentinel_tokens)
- t5_input = []
- (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
- (start_index, end_index) = (0, None)
- for span in masked_spans:
- flag = sentinel_tokens.popleft()
-
- # Append the same tokens in decoder input and output
- t5_decoder_in.append(flag)
- t5_decoder_in.extend(span.label)
- t5_decoder_out.append(flag)
- t5_decoder_out.extend(span.label)
-
- end_index = span.index[0]
- t5_input.extend(tokens[start_index: end_index])
- t5_input.append(flag)
-
- # the next start index is the token after the last span token
- start_index = span.index[-1] + 1
-
- # Add <eos> token to the t5_decoder_out
- t5_decoder_out.append(eos_id)
-
- # Add the remaining tokens to the t5 input
- t5_input.extend(tokens[start_index:])
+ # sentinel_tokens = collections.deque(sentinel_tokens)
+ # t5_input = []
+ # (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
+ # (start_index, end_index) = (0, None)
+ # for span in masked_spans:
+ # flag = sentinel_tokens.popleft()

+ # # Append the same tokens in decoder input and output
+ # t5_decoder_in.append(flag)
+ # t5_decoder_in.extend(span.label)
+ # t5_decoder_out.append(flag)
+ # t5_decoder_out.extend(span.label)

+ # end_index = span.index[0]
+ # t5_input.extend(tokens[start_index: end_index])
+ # t5_input.append(flag)

+ # # the next start index is the token after the last span token
+ # start_index = span.index[-1] + 1

+ # # Add <eos> token to the t5_decoder_out
+ # t5_decoder_out.append(eos_id)

+ # # Add the remaining tokens to the t5 input
+ # t5_input.extend(tokens[start_index:])


From 1b15263cddee63b0b693fece0ed75aedcc855b56 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:19:29 +0700
Subject: [PATCH 034/297] added sentinel tokens

---
 megatron/data/non_causal_mlm_dataset.py | 66 +++++++++++++++++--------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index dc5d36db5..8c5029fe6 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -251,6 +251,32 @@ def __init__(

 # Vocab stuff.
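The hunk below registers extra sentinel tokens on the dataset side. As a self-contained illustration of the same mechanism with a Hugging Face tokenizer (the model name and token strings are assumptions for the example, not taken from the patch):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
added = tok.add_special_tokens(
    {"additional_special_tokens": [f"<extra_id_{i}>" for i in range(21)]})
print(added)                                  # 21 tokens appended to the vocab
print(tok.additional_special_tokens_ids[:3])  # their newly assigned ids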
tokenizer = get_tokenizer()
+ tokenizer.add_special_tokens({
+ 'additional_special_tokens': [
+ '<extra_id_0>',
+ '<extra_id_1>',
+ '<extra_id_2>',
+ '<extra_id_3>',
+ '<extra_id_4>',
+ '<extra_id_5>',
+ '<extra_id_6>',
+ '<extra_id_7>',
+ '<extra_id_8>',
+ '<extra_id_9>',
+ '<extra_id_10>',
+ '<extra_id_11>',
+ '<extra_id_12>',
+ '<extra_id_13>',
+ '<extra_id_14>',
+ '<extra_id_15>',
+ '<extra_id_16>',
+ '<extra_id_17>',
+ '<extra_id_18>',
+ '<extra_id_19>',
+ '<extra_id_20>',
+ ]
+ })
+
 self.vocab_id_list = list(tokenizer.inv_vocab.keys())
 self.vocab_id_to_token_dict = tokenizer.inv_vocab
 self.cls_id = tokenizer.cls

From 1b1654198cb692037c756f32a86e39ad41cbef92 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:23:32 +0700
Subject: [PATCH 035/297] added sentinel tokens

---
 megatron/tokenizer/tokenizer.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 2cec3c797..142af1b3f 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -343,6 +343,33 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 self.tokenizer.add_tokens("<sep>")
 self.tokenizer.sep_token = "<sep>"

+ self.tokenizer.add_special_tokens({
+ 'additional_special_tokens': [
+ '<extra_id_0>',
+ '<extra_id_1>',
+ '<extra_id_2>',
+ '<extra_id_3>',
+ '<extra_id_4>',
+ '<extra_id_5>',
+ '<extra_id_6>',
+ '<extra_id_7>',
+ '<extra_id_8>',
+ '<extra_id_9>',
+ '<extra_id_10>',
+ '<extra_id_11>',
+ '<extra_id_12>',
+ '<extra_id_13>',
+ '<extra_id_14>',
+ '<extra_id_15>',
+ '<extra_id_16>',
+ '<extra_id_17>',
+ '<extra_id_18>',
+ '<extra_id_19>',
+ '<extra_id_20>',
+ ]
+ })
+
+
 @property
 def vocab_size(self):
 return len(self.tokenizer) # vocab_size doesn't contain additional tokens
@@ -381,6 +408,11 @@ def pad(self):
 def mask(self):
 return self.tokenizer.mask_token_id

+ @property
+ def additional_special_tokens(self):
+ """ All the additional special tokens you may want to use (list of strings)."""
+ return self.tokenizer.additional_special_tokens
+
 @property
 def bos_token_id(self):

From 0603aac7401a2a681f5cd0b6a07e45bc7aae1192 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:23:59 +0700
Subject: [PATCH 036/297] added sentinel tokens

---
 megatron/data/non_causal_mlm_dataset.py | 26 -------------------------
 1 file changed, 26 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 8c5029fe6..970bd7c55 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -251,32 +251,6 @@ def __init__(

 # Vocab stuff.
 tokenizer = get_tokenizer()
- tokenizer.add_special_tokens({
- 'additional_special_tokens': [
- '<extra_id_0>',
- '<extra_id_1>',
- '<extra_id_2>',
- '<extra_id_3>',
- '<extra_id_4>',
- '<extra_id_5>',
- '<extra_id_6>',
- '<extra_id_7>',
- '<extra_id_8>',
- '<extra_id_9>',
- '<extra_id_10>',
- '<extra_id_11>',
- '<extra_id_12>',
- '<extra_id_13>',
- '<extra_id_14>',
- '<extra_id_15>',
- '<extra_id_16>',
- '<extra_id_17>',
- '<extra_id_18>',
- '<extra_id_19>',
- '<extra_id_20>',
- ]
- })
-
 self.vocab_id_list = list(tokenizer.inv_vocab.keys())
 self.vocab_id_to_token_dict = tokenizer.inv_vocab
 self.cls_id = tokenizer.cls

From bd061d3893bf88c0445100f452d19be2aff2317a Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:25:33 +0700
Subject: [PATCH 037/297] added additional_special_tokens

---
 megatron/data/non_causal_mlm_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 970bd7c55..3616cab80 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -259,7 +259,7 @@ def __init__(
 self.pad_id = tokenizer.pad
 self.bos_id = tokenizer.bos_token_id
 self.eos_id = tokenizer.eos_token_id
- self.sentinel_tokens = tokenizer.additional_special_tokens_ids
+ self.sentinel_tokens = tokenizer.additional_special_tokens
 # Checks
 assert np.min(documents) >= 0
 assert np.max(documents) < indexed_dataset.sizes.shape[0]

From aff88b91c23e06d6d537b403645f5c5437b9a1fc Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:26:53 +0700
Subject: [PATCH 038/297] added additional_special_tokens

---
 megatron/data/non_causal_mlm_dataset.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 3616cab80..1620c763b 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -392,20 +392,20 @@ def build_training_sample(sample,
 t5_input.extend(tokens[start_index:])


- print("sample")
- print(sample)
- print("tokens")
- print(tokens)
- print("masks")
- print(masks)
- print("labels")
- print(labels)
+ # print("sample")
+ # print(sample)
+ # print("tokens")
+ # print(tokens)
+ # print("masks")
+ # print(masks)
+ # print("labels")
+ # print(labels)
 print("masked_spans")
 print(masked_spans)
- for idx, spans in enumerate(masked_spans):
- spans.index
- sentinel_tokens
- labels = spans.labels
+ # for idx, spans in enumerate(masked_spans):
+ # spans.index
+ # sentinel_tokens
+ # labels = spans.labels
 import sys
 sys.exit()

From aab4729831986fec1a0eb6458d7218999d3c866b Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 21:13:28 +0700
Subject: [PATCH 039/297] check t5_input and output

---
 megatron/data/non_causal_mlm_dataset.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 1620c763b..83909d9a3 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -391,6 +391,10 @@ def build_training_sample(sample,

 # Add the remaining tokens to the t5 input
 t5_input.extend(tokens[start_index:])

+ print("t5_input")
+ print(t5_input)
+ print("t5_decoder_out")
+ print(t5_decoder_out)

 # print("sample")
 # print(sample)

From 
9448ef48215b381916c8850ba09f05d790fca4c9 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 21:59:13 +0700
Subject: [PATCH 040/297] check decoder in and decoder out

---
 megatron/data/non_causal_mlm_dataset.py | 22 +++++++++++-----------
 megatron/tokenizer/tokenizer.py | 4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 83909d9a3..98eb51a13 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -259,7 +259,7 @@ def __init__(
 self.pad_id = tokenizer.pad
 self.bos_id = tokenizer.bos_token_id
 self.eos_id = tokenizer.eos_token_id
- self.sentinel_tokens = tokenizer.additional_special_tokens
+ self.sentinel_tokens = tokenizer.additional_special_tokens_ids
 # Checks
 assert np.min(documents) >= 0
 assert np.max(documents) < indexed_dataset.sizes.shape[0]
@@ -366,7 +366,7 @@ def build_training_sample(sample,
 # print(padded_labels)

 sentinel_tokens = collections.deque(sentinel_tokens)
- t5_input = []
+ input_tokens_ids = []
 (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
 (start_index, end_index) = (0, None)
 for span in masked_spans:
@@ -379,8 +379,8 @@ def build_training_sample(sample,
 t5_decoder_out.extend(span.label)

 end_index = span.index[0]
- t5_input.extend(tokens[start_index: end_index])
- t5_input.append(flag)
+ input_tokens_ids.extend(tokens[start_index: end_index])
+ input_tokens_ids.append(flag)

 # the next start index is the token after the last span token
 start_index = span.index[-1] + 1
@@ -389,12 +389,14 @@ def build_training_sample(sample,
 t5_decoder_out.append(eos_id)

 # Add the remaining tokens to the t5 input
- t5_input.extend(tokens[start_index:])
+ input_tokens_ids.extend(tokens[start_index:])

- print("t5_input")
- print(t5_input)
+ print("input_tokens_ids")
+ print(input_tokens_ids)
 print("t5_decoder_out")
 print(t5_decoder_out)
+ print("t5_decoder_in")
+ print(t5_decoder_in)

 # print("sample")
 # print(sample)
@@ -404,8 +406,8 @@ def build_training_sample(sample,
 # print(masks)
 # print("labels")
 # print(labels)
- print("masked_spans")
- print(masked_spans)
+ # print("masked_spans")
+ # print(masked_spans)
 # for idx, spans in enumerate(masked_spans):
 # spans.index
 # sentinel_tokens
 # labels = spans.labels
 import sys
 sys.exit()

 train_sample = {
 'text': padded_tokens,
- 'labels': padded_labels,
- 'mask': padded_masks,
 'prefix_len': 0
 }
 return train_sample

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 142af1b3f..84522a824 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -409,9 +409,9 @@ def mask(self):
 return self.tokenizer.mask_token_id

 @property
- def additional_special_tokens(self):
+ def additional_special_tokens_ids(self):
 """ All the additional special tokens you may want to use (list of strings)."""
- return self.tokenizer.additional_special_tokens
+ return self.tokenizer.additional_special_tokens_ids

 @property
 def bos_token_id(self):

From 16ba4aa2c78a5b0689bef1a2a9333b91c529e4db Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sun, 22 May 2022 11:07:14 +0700
Subject: [PATCH 041/297] made into input and output tokens

---
 megatron/data/non_causal_mlm_dataset.py | 48 +++++++++----------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 98eb51a13..c9b5ecfc6 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -367,16 +367,13 @@ def build_training_sample(sample,

 sentinel_tokens = collections.deque(sentinel_tokens)
 input_tokens_ids = []
- (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
+ output_tokens_ids = [] #[bos_id]
 (start_index, end_index) = (0, None)
 for span in masked_spans:
 flag = sentinel_tokens.popleft()

- # Append the same tokens in decoder input and output
- t5_decoder_in.append(flag)
- t5_decoder_in.extend(span.label)
- t5_decoder_out.append(flag)
- t5_decoder_out.extend(span.label)
+ output_tokens_ids.append(flag)
+ output_tokens_ids.extend(span.label)

 end_index = span.index[0]
 input_tokens_ids.extend(tokens[start_index: end_index])
@@ -385,41 +382,28 @@ def build_training_sample(sample,
 # the next start index is the token after the last span token
 start_index = span.index[-1] + 1

- # Add <eos> token to the t5_decoder_out
- t5_decoder_out.append(eos_id)

- # Add the remaining tokens to the t5 input
+ # Add the remaining tokens to input_tokens_ids
 input_tokens_ids.extend(tokens[start_index:])

+ # Add <eos> token to the output_tokens_ids
+ output_tokens_ids.append(eos_id)
+ prefix_len = len(input_tokens_ids)

+ text_tokens_ids = input_tokens_ids + output_tokens_ids
+ print("text_tokens_ids")
+ print(text_tokens_ids)
+ print("input_tokens_ids")
+ print(input_tokens_ids)
+ print("output_tokens_ids")
+ print(output_tokens_ids)
+
 import sys
 sys.exit()

- train_sample = {
- 'text': padded_tokens,
- 'prefix_len': 0
- }
- return train_sample
+ return {
+ 'text': input_tokens_ids,
+ 'prefix_len': prefix_len
+ }

def _build_index_mappings(name, data_prefix, documents, sizes,

From 99ca9e80d120245072b4cdf6b1a6b1b0040e935c Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sun, 22 May 2022 11:14:31 +0700
Subject: [PATCH 042/297] made into input and output tokens

---
 megatron/data/non_causal_mlm_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index c9b5ecfc6..73f0e72c4 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -364,6 +364,8 @@ def build_training_sample(sample,
 # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length)
 # print(padded_tokens)
 # print(padded_labels)
+ print("sentinel_tokens")
+ print(sentinel_tokens)

 sentinel_tokens = collections.deque(sentinel_tokens)
 input_tokens_ids = []

From cdfecad89facb61797eb2e926bc1e077e02ec33b Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sun, 22 May 2022 11:17:44 +0700
Subject: [PATCH 043/297] made into input and output tokens

---
 megatron/data/non_causal_mlm_dataset.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 73f0e72c4..a39c099f1 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -260,6 +260,9 @@ def __init__(
 self.bos_id = tokenizer.bos_token_id
 self.eos_id = tokenizer.eos_token_id
 self.sentinel_tokens = tokenizer.additional_special_tokens_ids
+
+ print("self.sentinel_tokens")
+ print(self.sentinel_tokens)
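Pulling the last several commits together, the packed sample layout they converge on can be sketched as follows; the token ids here are invented for illustration only:

# Corrupted input (the prefix) followed by sentinel-delimited targets;
# prefix_len marks where the loss-bearing target region begins.
input_tokens_ids = [200, 201, 32100, 204, 205]  # 32100 standing in for <extra_id_0>
output_tokens_ids = [32100, 202, 203, 1]        # 1 standing in for the eos id
prefix_len = len(input_tokens_ids)
text = input_tokens_ids + output_tokens_ids
print(prefix_len, text)  # 5 [200, 201, 32100, 204, 205, 32100, 202, 203, 1]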
# Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -366,6 +369,8 @@ def build_training_sample(sample, # print(padded_labels) print("sentinel_tokens") print(sentinel_tokens) + import sys + sys.exit() sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] From e058688181243df32fd809431c40cef0e6d5c029 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:24:44 +0700 Subject: [PATCH 044/297] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a39c099f1..39e7105c8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -367,10 +367,6 @@ def build_training_sample(sample, # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) # print(padded_tokens) # print(padded_labels) - print("sentinel_tokens") - print(sentinel_tokens) - import sys - sys.exit() sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] @@ -398,11 +394,11 @@ def build_training_sample(sample, text_tokens_ids = input_tokens_ids + output_tokens_ids print("text_tokens_ids") - print(text_tokens_ids) + print(len(text_tokens_ids)) print("input_tokens_ids") - print(input_tokens_ids) + print(len(input_tokens_ids)) print("output_tokens_ids") - print(output_tokens_ids) + print(len(output_tokens_ids)) import sys sys.exit() From 38ded727b42765f8ce6ec7435c604ad460857e8b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 22 May 2022 11:29:32 +0700 Subject: [PATCH 045/297] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 9 ++------- megatron/tokenizer/tokenizer.py | 4 ++++ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 39e7105c8..2b3ace71d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -361,13 +361,6 @@ def build_training_sample(sample, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. - # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) - # padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length) - # padded_masks = pad_and_convert_to_numpy(masks, pad_id, max_seq_length) - # print(padded_tokens) - # print(padded_labels) - sentinel_tokens = collections.deque(sentinel_tokens) input_tokens_ids = [] output_tokens_ids = [] #[bos_id] @@ -402,6 +395,8 @@ def build_training_sample(sample, import sys sys.exit() + # Padding. 
+ # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length)

 return {
 'text': input_tokens_ids,
 'prefix_len': prefix_len
 }

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 37ae7872b..953a63354 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -369,6 +369,10 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 ]
 })

+ print(self.tokenizer.special_tokens_map)
+ import sys
+ sys.exit()
+
+
 @property
 def vocab_size(self):
 return len(self.tokenizer) # vocab_size doesn't contain additional tokens

From 0f68be34b4d31cbaedc601169283919289b75fda Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sun, 22 May 2022 11:30:50 +0700
Subject: [PATCH 046/297] made into input and output tokens

---
 megatron/tokenizer/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 953a63354..2627f65b3 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -369,7 +369,7 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 ]
 })

- print(self.tokenizer.special_tokens_map)
+ print(self.tokenizer.additional_special_tokens_ids)
 import sys
 sys.exit()

From eb84844dfc4d06f933e4a2cad01c0104597f6c02 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sun, 22 May 2022 11:40:56 +0700
Subject: [PATCH 047/297] made into input and output tokens

---
 megatron/tokenizer/tokenizer.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 2627f65b3..37ae7872b 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -331,17 +331,17 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
 self.encoder = self.tokenizer.get_vocab()
 self.decoder = {v: k for k, v in self.encoder.items()}

- if 'mask_token' not in self.tokenizer.special_tokens_map:
- self.tokenizer.add_tokens("<mask>")
- self.tokenizer.mask_token = "<mask>"
+ # if 'mask_token' not in self.tokenizer.special_tokens_map:
+ # self.tokenizer.add_tokens("<mask>")
+ # self.tokenizer.mask_token = "<mask>"

- if 'cls_token' not in self.tokenizer.special_tokens_map:
- self.tokenizer.add_tokens("<cls>")
- self.tokenizer.cls_token = "<cls>"
+ # if 'cls_token' not in self.tokenizer.special_tokens_map:
+ # self.tokenizer.add_tokens("<cls>")
+ # self.tokenizer.cls_token = "<cls>"

- if 'sep_token' not in self.tokenizer.special_tokens_map:
- self.tokenizer.add_tokens("<sep>")
- self.tokenizer.sep_token = "<sep>"
+ # if 'sep_token' not in self.tokenizer.special_tokens_map:
+ # self.tokenizer.add_tokens("<sep>")
+ # self.tokenizer.sep_token = "<sep>"

 self.tokenizer.add_special_tokens({
 'additional_special_tokens': [

From a3af6bf9c7ad186c09483d3e5eee09723ba2742c Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 23 May 2022 16:00:49 +0700
Subject: [PATCH 048/297] made into input and output tokens

---
 megatron/data/non_causal_mlm_dataset.py | 8 +++++++-
 megatron/tokenizer/tokenizer.py | 5 -----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py
index 2b3ace71d..6c06dfcea 100644
--- a/megatron/data/non_causal_mlm_dataset.py
+++ b/megatron/data/non_causal_mlm_dataset.py
@@ -385,7 +385,7 @@ def build_training_sample(sample,
 output_tokens_ids.append(eos_id)
 prefix_len = len(input_tokens_ids)

- text_tokens_ids = input_tokens_ids + output_tokens_ids
+ text_tokens_ids = input_tokens_ids.extend(output_tokens_ids)
 print("text_tokens_ids")
 print(len(text_tokens_ids))
 print("input_tokens_ids")
@@ -393,6
+393,12 @@ def build_training_sample(sample, print("output_tokens_ids") print(len(output_tokens_ids)) + # input_tokens_ids = pad_and_convert_to_numpy( + # input_tokens_ids, + # self.tokenizer.pad, + # self.seq_length + # ) + import sys sys.exit() # Padding. diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 37ae7872b..461714fe3 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -369,11 +369,6 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): ] }) - print(self.tokenizer.additional_special_tokens_ids) - import sys - sys.exit() - - @property def vocab_size(self): return len(self.tokenizer) # vocab_size doesn't contain additional tokens From 6ad61b6dcef9e9ca58b6bd525ee090502bab4b63 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:02:18 +0700 Subject: [PATCH 049/297] made into input and output tokens --- megatron/data/non_causal_mlm_dataset.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6c06dfcea..a61372cc0 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -261,8 +261,6 @@ def __init__( self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - print("self.sentinel_tokens") - print(self.sentinel_tokens) # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] From 9131fdd7ae376d807ef7fe2f9e0b2488ad0f3dae Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:05:04 +0700 Subject: [PATCH 050/297] added eos --- megatron/tokenizer/tokenizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 461714fe3..ef3bfdf91 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -403,6 +403,10 @@ def sep(self): def pad(self): return self.tokenizer.pad_token_id + @property + def eod(self): + return self.tokenizer.eos_token_id + @property def mask(self): return self.tokenizer.mask_token_id From cb76cd311805097450b94e3a5c2f0376a58df154 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 23 May 2022 16:06:34 +0700 Subject: [PATCH 051/297] added eos --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index ef3bfdf91..7a156115b 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -404,7 +404,7 @@ def pad(self): return self.tokenizer.pad_token_id @property - def eod(self): + def eod(self): return self.tokenizer.eos_token_id @property From 531ee688e1891055f724154cf152f62111e127d3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:09:21 +0700 Subject: [PATCH 052/297] test text_token --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a61372cc0..e5beec9ba 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -385,6 +385,7 @@ def build_training_sample(sample, text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) print("text_tokens_ids") + print(text_tokens_ids) print(len(text_tokens_ids)) print("input_tokens_ids") print(len(input_tokens_ids)) From a7d115835155f8936819a884c1787eb5cafa9b9b Mon Sep 17 00:00:00 
2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:13:24 +0700 Subject: [PATCH 053/297] test text_token --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e5beec9ba..a691e3070 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,14 +383,14 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) - print("text_tokens_ids") - print(text_tokens_ids) - print(len(text_tokens_ids)) print("input_tokens_ids") print(len(input_tokens_ids)) print("output_tokens_ids") print(len(output_tokens_ids)) + text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) + print("text_tokens_ids") + print(text_tokens_ids) + print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( # input_tokens_ids, From 0008cfb0d9a0de42a41b62fd1af275d59ffaedb1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:14:50 +0700 Subject: [PATCH 054/297] test text_token --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a691e3070..382e6a5c2 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -384,9 +384,9 @@ def build_training_sample(sample, prefix_len = len(input_tokens_ids) print("input_tokens_ids") - print(len(input_tokens_ids)) + print(input_tokens_ids) print("output_tokens_ids") - print(len(output_tokens_ids)) + print(output_tokens_ids) text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) print("text_tokens_ids") print(text_tokens_ids) From f1461a8352a6e0a35b58e9da436cc1cc6883aff2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 13:16:09 +0700 Subject: [PATCH 055/297] test text_token --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 382e6a5c2..caad3319b 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -387,7 +387,7 @@ def build_training_sample(sample, print(input_tokens_ids) print("output_tokens_ids") print(output_tokens_ids) - text_tokens_ids = input_tokens_ids.extend(output_tokens_ids) + text_tokens_ids = input_tokens_ids+output_tokens_ids print("text_tokens_ids") print(text_tokens_ids) print(len(text_tokens_ids)) From ada0f1007d1b634d35a292547d3225c7368d0310 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 15:11:11 +0700 Subject: [PATCH 056/297] test text_token --- megatron/data/non_causal_mlm_dataset.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index caad3319b..f0cf81287 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,13 +383,13 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - print("input_tokens_ids") - print(input_tokens_ids) - print("output_tokens_ids") - print(output_tokens_ids) text_tokens_ids = input_tokens_ids+output_tokens_ids + # print("input_tokens_ids") + # print(input_tokens_ids) 
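# Editor's note: the PATCH 048-055 churn around this hunk chases a classic
# Python pitfall. list.extend() mutates its receiver in place and returns
# None, so `text_tokens_ids = input_tokens_ids.extend(output_tokens_ids)`
# bound text_tokens_ids to None and len(text_tokens_ids) raised a TypeError;
# PATCH 055 restores concatenation with `+`. A minimal standalone
# illustration (variable names here are illustrative only):
input_ids, output_ids = [5, 6, 7], [8, 9]
broken = input_ids.extend(output_ids)  # extend() returns None...
assert broken is None                  # ...so len(broken) raises TypeError
fixed = [5, 6, 7] + [8, 9]             # `+` returns a new concatenated list
assert fixed == [5, 6, 7, 8, 9]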
+ # print("output_tokens_ids") + # print(output_tokens_ids) print("text_tokens_ids") - print(text_tokens_ids) + # print(text_tokens_ids) print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( @@ -398,10 +398,10 @@ def build_training_sample(sample, # self.seq_length # ) + # Padding. + padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) import sys sys.exit() - # Padding. - # padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) return { 'text': input_tokens_ids, From 298c9b71fc7ca24224a6fcfd432ac6615bd3178f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:24:31 +0700 Subject: [PATCH 057/297] assigned array --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f0cf81287..5236ef25f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -383,7 +383,7 @@ def build_training_sample(sample, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - text_tokens_ids = input_tokens_ids+output_tokens_ids + text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) # print("input_tokens_ids") # print(input_tokens_ids) # print("output_tokens_ids") @@ -399,7 +399,7 @@ def build_training_sample(sample, # ) # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length) + padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() From d2bdff6e46e9dd15145b51c0ac28d3625dbd4fa1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:29:41 +0700 Subject: [PATCH 058/297] assigned array --- examples/finetune_mp3.sh | 4 ++-- megatron/data/non_causal_mlm_dataset.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index 1e9919a86..598f2acec 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -15,8 +15,8 @@ deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ --num-attention-heads 4 \ --micro-batch-size 4 \ --global-batch-size 16 \ - --seq-length 512 \ - --max-position-embeddings 512 \ + --seq-length 626 \ + --max-position-embeddings 1024 \ --train-iters 10000 \ --lr-decay-iters 5000 \ --save $CHECKPOINT_PATH \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 5236ef25f..e2fb731b8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -399,6 +399,7 @@ def build_training_sample(sample, # ) # Padding. 
+ print(max_seq_length) padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() From 4ec8db3272b10fca7e8f57b7c2ac99c2bb854dcf Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:31:31 +0700 Subject: [PATCH 059/297] assigned array --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e2fb731b8..7c283ef8d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -384,10 +384,10 @@ def build_training_sample(sample, prefix_len = len(input_tokens_ids) text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # print("input_tokens_ids") - # print(input_tokens_ids) - # print("output_tokens_ids") - # print(output_tokens_ids) + print("input_tokens_ids") + print(len(input_tokens_ids)) + print("output_tokens_ids") + print(len(output_tokens_ids)) print("text_tokens_ids") # print(text_tokens_ids) print(len(text_tokens_ids)) From 10a2b6d1794f15b4de21b43e11cc5dc46bb99cbe Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 24 May 2022 17:33:42 +0700 Subject: [PATCH 060/297] hardcoded sequence length --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7c283ef8d..0229f39cc 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -348,7 +348,7 @@ def build_training_sample(sample, tokens = sample # Truncate to `target_sequence_length`. - max_num_tokens = max_seq_length + max_num_tokens = 512 #max_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] From a373a70032ecd63c746b2036e35468ab0403d555 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 22:20:13 +0700 Subject: [PATCH 061/297] check again --- megatron/data/non_causal_mlm_dataset.py | 3 +- megatron/tokenizer/tokenizer.py | 44 ++++++++++++------------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0229f39cc..eae03d4ac 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -386,8 +386,10 @@ def build_training_sample(sample, text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) print("input_tokens_ids") print(len(input_tokens_ids)) + print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) + print(output_tokens_ids) print("text_tokens_ids") # print(text_tokens_ids) print(len(text_tokens_ids)) @@ -399,7 +401,6 @@ def build_training_sample(sample, # ) # Padding. 
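# Editor's note: `--seq-length 626` in PATCH 058 together with the hardcoded
# `max_num_tokens = 512` in PATCH 060 is presumably the standard T5
# span-corruption bookkeeping: 626 = 512 (corrupted inputs, sentinels and
# EOS included) + 114 (targets), packed into one decoder-only sequence. A
# rough re-derivation under T5's defaults (noise density 0.15, mean span
# length 3; the exact helper used upstream is an assumption):
raw_length = 568                                  # tokens read per sample
num_noise_tokens = round(raw_length * 0.15)       # 85 masked tokens
num_spans = round(num_noise_tokens / 3.0)         # 28 spans -> 28 sentinels
inputs_length = raw_length - num_noise_tokens + num_spans + 1   # 512 w/ EOS
targets_length = num_noise_tokens + num_spans + 1               # 114 w/ EOS
assert inputs_length + targets_length == 626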
- print(max_seq_length) padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 7a156115b..8d458e5cd 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -17,7 +17,7 @@ from abc import ABC from abc import abstractmethod -from transformers import AutoTokenizer +from transformers import AutoTokenizer, AddedToken from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer @@ -345,27 +345,27 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.tokenizer.add_special_tokens({ 'additional_special_tokens': [ - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From bdef71b0ba645a53ae209ace95bcfcafdf04584c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:09:31 +0700 Subject: [PATCH 062/297] show sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index eae03d4ac..09e6a70ef 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -348,10 +348,13 @@ def build_training_sample(sample, tokens = sample # Truncate to `target_sequence_length`. - max_num_tokens = 512 #max_seq_length + max_num_tokens = max_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] + print(sentinel_tokens) + import sys + sys.exit() # Masking. 
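# Editor's note: two remarks on the PATCH 061 tokenizer hunk above. First,
# the token strings render as '' here, most likely because angle-bracketed
# names (sentinels along the lines of '<extra_id_0>', plus the mask/cls/sep
# tokens in the block commented out by PATCH 047) were stripped as HTML-like
# tags when this series was archived; the exact originals cannot be
# recovered from this text alone, so they are left as-is. Second, the
# transformers/tokenizers AddedToken keyword is `normalized`, not
# `normalization`, so the calls as written would likely raise a TypeError
# rather than disable normalization. A sketch of the presumably intended
# construction (sentinel names are illustrative):
from transformers import AddedToken

sentinels = [
    AddedToken(f"<extra_id_{i}>", lstrip=False, rstrip=False,
               normalized=False)
    for i in range(20)
]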
max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( @@ -379,6 +382,7 @@ def build_training_sample(sample, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) + input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) From 262fd6ce54c7149f082071157bb6e2f48948bd02 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:10:52 +0700 Subject: [PATCH 063/297] show sentinal tokens --- examples/finetune_mp3.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index 598f2acec..d813a9258 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -5,11 +5,11 @@ RANK=0 WORLD_SIZE=1 -DATA_PATH=data/t0-test_text_document +DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ +deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ @@ -23,7 +23,7 @@ deepspeed --num_gpus 2 pretrain_mp3_gpt.py \ --load $CHECKPOINT_PATH \ --data-path $DATA_PATH \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles \ + --tokenizer-name-or-path bigscience/tokenizer \ --data-impl mmap \ --split 949,50,1 \ --distributed-backend nccl \ From 68a6a936fcaf6957093c9acdee51d35825fd107b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:14:17 +0700 Subject: [PATCH 064/297] show sentinal tokens --- examples/finetune_mp3.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh index d813a9258..59cb34d4c 100644 --- a/examples/finetune_mp3.sh +++ b/examples/finetune_mp3.sh @@ -14,7 +14,7 @@ deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --hidden-size 128 \ --num-attention-heads 4 \ --micro-batch-size 4 \ - --global-batch-size 16 \ + --global-batch-size 1024 \ --seq-length 626 \ --max-position-embeddings 1024 \ --train-iters 10000 \ @@ -39,4 +39,4 @@ deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ --eval-interval 1000 \ --eval-iters 10 \ --fp16 \ - --tensorboard-dir GPT2 + --tensorboard-dir LOG From 1c00d4bbf4ca11ce7ee1a2dfee5c660f79f439a5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:18:05 +0700 Subject: [PATCH 065/297] show sentinal tokens --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 09e6a70ef..578b53120 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -353,8 +353,7 @@ def build_training_sample(sample, tokens = tokens[:max_num_tokens] print(sentinel_tokens) - import sys - sys.exit() + # Masking. 
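# Editor's note: the masking step below feeds the sentinel loop of
# build_training_sample (visible in the earlier hunks), which implements
# T5-style span corruption. A toy illustration of the encoding it produces,
# with placeholder sentinel names (the real ids come from
# tokenizer.additional_special_tokens_ids):
#   tokens : The cat sat on the mat
#   masked spans : (cat sat) and (the)
#   inputs : The <extra_id_0> on <extra_id_1> mat
#   targets: <extra_id_0> cat sat <extra_id_1> the </s>
# The decoder-only variant then packs inputs + targets into one sequence,
# with prefix_len marking where the loss should start.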
max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( From 8b85f113788dd8199f20d22e50aebc81e04da58b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:25:10 +0700 Subject: [PATCH 066/297] add more special tokens --- megatron/data/non_causal_mlm_dataset.py | 2 -- megatron/tokenizer/tokenizer.py | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 578b53120..e00cad309 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -352,8 +352,6 @@ def build_training_sample(sample, truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] - print(sentinel_tokens) - # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 8d458e5cd..7079ad353 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -366,6 +366,10 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From 85d204afadb4ba366e2e0f30c4bd2bebe03ae518 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:35:56 +0700 Subject: [PATCH 067/297] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 79 +++++++++++++++---------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e00cad309..5811f7fe7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -249,6 +249,17 @@ def __init__( # Dataset. self.indexed_dataset = indexed_dataset + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + # Vocab stuff. tokenizer = get_tokenizer() self.vocab_id_list = list(tokenizer.inv_vocab.keys()) @@ -261,45 +272,51 @@ def __init__( self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - # Checks - assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] + # # Checks + # assert np.min(documents) >= 0 + # assert np.max(documents) < indexed_dataset.sizes.shape[0] - # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix, documents, self.indexed_dataset.sizes, - num_samples, seq_length, seed) + # # Build index mappings. 
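# Editor's note: PATCH 067 here swaps the GPT-style token-offset index
# (_build_index_mappings / doc_idx / sample_idx / shuffle_idx) for the
# BERT/T5-style get_samples_mapping helper, whose rows are
# (start_sentence_index, end_sentence_index, target_sequence_length), as the
# new __getitem__ unpacking shows. A minimal sketch of how such a mapping is
# consumed (illustrative only):
def fetch_sample(indexed_dataset, samples_mapping, idx):
    start_index, end_index, seq_length = samples_mapping[idx]
    # Concatenate whole stored sentences instead of slicing token offsets.
    sentences = [indexed_dataset[i] for i in range(start_index, end_index)]
    return sentences, seq_length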
+ # self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( + # self.name, data_prefix, documents, self.indexed_dataset.sizes, + # num_samples, seq_length, seed) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 + # return self.sample_idx.shape[0] - 1 + return self.samples_mapping.shape[0] def __getitem__(self, idx): - idx = self.shuffle_idx[idx] - # Start and end documents and offsets. - doc_index_f = self.sample_idx[idx][0] - doc_index_l = self.sample_idx[idx + 1][0] - offset_f = self.sample_idx[idx][1] - offset_l = self.sample_idx[idx + 1][1] - # If we are within the same document, just extract the chunk. - if doc_index_f == doc_index_l: - sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f, - length=offset_l - offset_f + 1) - else: - # Otherwise, get the rest of the initial document. - sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - offset=offset_f)] - # Loop over all in between documents and add the entire document. - for i in range(doc_index_f + 1, doc_index_l): - sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # And finally add the relevant portion of last document. - sample_list.append(self.indexed_dataset.get( - self.doc_idx[doc_index_l], - length=offset_l + 1)) - sample = np.concatenate(sample_list) + # idx = self.shuffle_idx[idx] + # # Start and end documents and offsets. + # doc_index_f = self.sample_idx[idx][0] + # doc_index_l = self.sample_idx[idx + 1][0] + # offset_f = self.sample_idx[idx][1] + # offset_l = self.sample_idx[idx + 1][1] + # # If we are within the same document, just extract the chunk. + # if doc_index_f == doc_index_l: + # sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], + # offset=offset_f, + # length=offset_l - offset_f + 1) + # else: + # # Otherwise, get the rest of the initial document. + # sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], + # offset=offset_f)] + # # Loop over all in between documents and add the entire document. + # for i in range(doc_index_f + 1, doc_index_l): + # sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) + # # And finally add the relevant portion of last document. + # sample_list.append(self.indexed_dataset.get( + # self.doc_idx[doc_index_l], + # length=offset_l + 1)) + # sample = np.concatenate(sample_list) + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From 4c84274846c6eda143a9121192939e32b4d4555b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:51:36 +0700 Subject: [PATCH 068/297] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 5811f7fe7..d8dab0153 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -252,10 +252,10 @@ def __init__( # Build the samples mapping. 
self.samples_mapping = get_samples_mapping(self.indexed_dataset, data_prefix, - num_epochs, - max_num_samples, + False, #num_epochs, + num_samples, #max_num_samples, self.max_seq_length - 2, # account for added tokens - short_seq_prob, + 0.1, #short_seq_prob, self.seed, self.name, False) From 084245e59e4d76de9e80275c00b9af88f265da5f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 28 May 2022 23:53:54 +0700 Subject: [PATCH 069/297] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d8dab0153..b8b6dda16 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -254,7 +254,7 @@ def __init__( data_prefix, False, #num_epochs, num_samples, #max_num_samples, - self.max_seq_length - 2, # account for added tokens + self.seq_length-2, #self.max_seq_length - 2, # account for added tokens 0.1, #short_seq_prob, self.seed, self.name, From 32af10e876258b94256e01580c69791f12cea810 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:01:27 +0700 Subject: [PATCH 070/297] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b8b6dda16..eaa98d0f7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,6 +317,8 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + # sample = np.concatenate(sample_list) + print(sample) # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From b6e0e6364000a8cc0c1e3eecb207953f661c50ce Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:03:53 +0700 Subject: [PATCH 071/297] changed how mlm data is loaded --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index eaa98d0f7..d1839c2e6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,7 +317,7 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - # sample = np.concatenate(sample_list) + sample = np.concatenate(sample) print(sample) # Note that this rng state should be numpy and not python since From 2af2e4b74dc83d50ae5deed867ca41cbfae78abe Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:16:31 +0700 Subject: [PATCH 072/297] added new script --- examples/train_non_causal_mlm_adaption.sh | 42 ++++ megatron/data/non_causal_mlm_dataset.py | 1 - train_non_causal_mlm_adaptation_gpt.py | 257 ++++++++++++++++++++++ 3 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 examples/train_non_causal_mlm_adaption.sh create mode 100644 train_non_causal_mlm_adaptation_gpt.py diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh new file mode 100644 index 000000000..b47ea7142 --- /dev/null +++ b/examples/train_non_causal_mlm_adaption.sh @@ -0,0 +1,42 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=data/mc4-id_text_document +CHECKPOINT_PATH=data + + +deepspeed --num_gpus 8 train_non_causal_mlm_adaption_mlm.py \ + --num-layers 2 \ + --hidden-size 128 \ + --num-attention-heads 4 \ + --micro-batch-size 4 \ + --global-batch-size 1024 \ + --seq-length 626 \ + --max-position-embeddings 1024 \ + --train-iters 10000 \ + --lr-decay-iters 5000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience/tokenizer \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 \ + --tensorboard-dir LOG diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index d1839c2e6..fadfcb6fc 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -398,7 +398,6 @@ def build_training_sample(sample, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) - input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py new file mode 100644 index 000000000..b7af289a5 --- /dev/null +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
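# Editor's note: the launcher added above
# (examples/train_non_causal_mlm_adaption.sh) invokes
# `train_non_causal_mlm_adaption_mlm.py`, but the entry script this same
# patch creates is `train_non_causal_mlm_adaptation_gpt.py`; as committed,
# the deepspeed command would fail with a missing-file error until one of
# the two names is aligned.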
+ +"""Pretrain GPT""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prefix_len'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and position ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, 
shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From cc5968e5ed05aaea343ae17776cdb12b3024c408 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:26:13 +0700 Subject: [PATCH 073/297] added new script --- megatron/data/non_causal_mlm_dataset.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index fadfcb6fc..4bcd41e53 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,8 +317,12 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + print(self.indexed_dataset[index]) + print(len(self.indexed_dataset[index])) sample = np.concatenate(sample) print(sample) + import sys + sys.exit() # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. From cf0b2a0f7359dc7dce9d54a6ca16a7c5a831d470 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:27:55 +0700 Subject: [PATCH 074/297] added new script --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 4bcd41e53..e6cab3c59 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -317,10 +317,9 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - print(self.indexed_dataset[index]) print(len(self.indexed_dataset[index])) sample = np.concatenate(sample) - print(sample) + print(len(sample)) import sys sys.exit() From fc150a05a036a851ef1b04f3c3a103749a8446fe Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:50:32 +0700 Subject: [PATCH 075/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 654 ++++++------------------ train_non_causal_mlm_adaptation_gpt.py | 18 +- 2 files changed, 172 insertions(+), 500 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e6cab3c59..1b5d37250 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -15,236 +15,30 @@ """GPT Non-Causal Mask Language Model Finetune Style dataset.""" -import os -import time import collections import numpy as np import torch -from megatron import mpu, print_rank_0, get_tokenizer -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ -from megatron.data.dataset_utils import create_masked_lm_predictions, get_samples_mapping -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and 
test datasets.""" - - # Single dataset. - if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup) - # Blending dataset. - else: - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - all_train_datasets = BlendableDataset(train_datasets, weights) \ - if train_datasets else None - all_valid_datasets = BlendableDataset(valid_datasets, weights) \ - if valid_datasets else None - all_test_datasets = BlendableDataset(test_datasets, weights) \ - if test_datasets else None - - return all_train_datasets, all_valid_datasets, all_test_datasets - - -def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, train_valid_test): - ''' - Build a single dataset group corresponding to Option 2 of data loading see arguments.py - a dataset group is passed on the following form - GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 - or alternatively - GIVEN_NAME PATH1 # for a single dataset to be used fully - ''' - - assert train_valid_test in ["train","valid","test"] - - # Single dataset. - if len(paths) == 1: - dataset = _build_single_datasets(paths[0], - splits[0], - data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - dataset_group_name, train_valid_test) - return dataset - # Blending dataset. - else: - - data_prefix = [] - # data_prefix is on the shape: - # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] - for w,p in zip(weights, paths): - data_prefix += [w,p] - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - datasets = [] - for i in range(len(prefixes)): - ds = _build_single_datasets(prefixes[i], - splits[i], - data_impl, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, skip_warmup, - dataset_group_name, train_valid_test) - - datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) - - return all_datasets - -def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): - """Build a single dataset""" - - assert train_valid_test in ["train","valid","test"] - index = ["train","valid","test"].index(train_valid_test) - - # Indexed dataset. 
- indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # this corresponds to option2 for data loading on the form - # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 - # splits here is an array of size 2 [start_index, end_index] - splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - print_rank_0(' {}:'.format(dataset_group_name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[0], splits[1], - splits[1] - splits[0])) - - def build_dataset(name): - dataset = None - if splits[1] > splits[0]: - documents = np.arange(start=splits[0], stop=splits[1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - dataset = build_dataset(dataset_group_name) - - return dataset - - -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and test datasets.""" - - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - # Print stats about the splits. - print_rank_0(' > dataset split:') - - def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -def get_indexed_dataset_(path, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - start_time = time.time() - indexed_dataset = make_indexed_dataset(path, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -class NonCausalMLMDataset(torch.utils.data.Dataset): - - def __init__( - self, - name, - data_prefix, - documents, - indexed_dataset, - num_samples, - seq_length, - seed, - masked_lm_prob=0.15, - ): +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class T5Dataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, max_seq_length_dec, + 
short_seq_prob, seed): # Params to store. self.name = name - self.seq_length = seq_length self.seed = seed self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec # Dataset. self.indexed_dataset = indexed_dataset @@ -252,10 +46,10 @@ def __init__( # Build the samples mapping. self.samples_mapping = get_samples_mapping(self.indexed_dataset, data_prefix, - False, #num_epochs, - num_samples, #max_num_samples, - self.seq_length-2, #self.max_seq_length - 2, # account for added tokens - 0.1, #short_seq_prob, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, self.seed, self.name, False) @@ -271,75 +65,34 @@ def __init__( self.bos_id = tokenizer.bos_token_id self.eos_id = tokenizer.eos_token_id self.sentinel_tokens = tokenizer.additional_special_tokens_ids - - # # Checks - # assert np.min(documents) >= 0 - # assert np.max(documents) < indexed_dataset.sizes.shape[0] - - # # Build index mappings. - # self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - # self.name, data_prefix, documents, self.indexed_dataset.sizes, - # num_samples, seq_length, seed) + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - # return self.sample_idx.shape[0] - 1 return self.samples_mapping.shape[0] def __getitem__(self, idx): - # idx = self.shuffle_idx[idx] - # # Start and end documents and offsets. - # doc_index_f = self.sample_idx[idx][0] - # doc_index_l = self.sample_idx[idx + 1][0] - # offset_f = self.sample_idx[idx][1] - # offset_l = self.sample_idx[idx + 1][1] - # # If we are within the same document, just extract the chunk. - # if doc_index_f == doc_index_l: - # sample = self.indexed_dataset.get(self.doc_idx[doc_index_f], - # offset=offset_f, - # length=offset_l - offset_f + 1) - # else: - # # Otherwise, get the rest of the initial document. - # sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f], - # offset=offset_f)] - # # Loop over all in between documents and add the entire document. - # for i in range(doc_index_f + 1, doc_index_l): - # sample_list.append(self.indexed_dataset.get(self.doc_idx[i])) - # # And finally add the relevant portion of last document. - # sample_list.append(self.indexed_dataset.get( - # self.doc_idx[doc_index_l], - # length=offset_l + 1)) - # sample = np.concatenate(sample_list) - start_index, end_index, seq_length = self.samples_mapping[idx] sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) - print(len(self.indexed_dataset[index])) - sample = np.concatenate(sample) - print(len(sample)) - import sys - sys.exit() - # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, - self.seq_length, # needed for padding + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, self.vocab_id_list, self.vocab_id_to_token_dict, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.masked_lm_prob, np_rng, self.bos_id, self.eos_id, - self.sentinel_tokens - ) + self.sentinel_tokens) -def build_training_sample(sample, - max_seq_length, +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, vocab_id_list, vocab_id_to_token_dict, cls_id, sep_id, mask_id, pad_id, masked_lm_prob, np_rng, bos_id=None, @@ -348,6 +101,7 @@ def build_training_sample(sample, Arguments: sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. @@ -365,47 +119,35 @@ def build_training_sample(sample, sentinel_tokens: unique value to be substituted for every replaced span """ + assert target_seq_length <= max_seq_length + # flatten sentences into one list - # tokens = [token for sentence in sample for token in sentence] - tokens = sample + tokens = [token for sentence in sample for token in sentence] # Truncate to `target_sequence_length`. - max_num_tokens = max_seq_length + max_num_tokens = target_seq_length truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] # Masking. max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( + (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - sentinel_tokens = collections.deque(sentinel_tokens) - input_tokens_ids = [] - output_tokens_ids = [] #[bos_id] - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - output_tokens_ids.append(flag) - output_tokens_ids.extend(span.label) - - end_index = span.index[0] - input_tokens_ids.extend(tokens[start_index: end_index]) - input_tokens_ids.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 + # Padding. + input_tokens_ids, output_tokens_ids, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # Add the remaining tokens to input_tokens_ids - input_tokens_ids.extend(tokens[start_index:]) - # Add token to the output_tokens_ids - output_tokens_ids.append(eos_id) - prefix_len = len(input_tokens_ids) - text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) + text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) @@ -422,8 +164,6 @@ def build_training_sample(sample, # self.seq_length # ) - # Padding. 
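# Editor's note: `prefix_len = len(input_tokens_ids)` in the hunk above
# marks the boundary that the training scripts pass to
# get_ltor_masks_and_position_ids via `prefix_indices`, turning the causal
# mask into a PrefixLM mask: bidirectional over the corrupted inputs, causal
# over the targets, with loss restricted to targets when
# --loss-on-targets-only is set. A minimal sketch of that mask construction
# (an illustration, not the Megatron implementation):
import torch

def prefix_lm_mask(seq_len, prefix_len):
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))  # causal
    mask[:, :prefix_len] = True  # every position may attend into the prefix
    return mask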
- padded_tokens = pad_and_convert_to_numpy(text_tokens_ids, pad_id, max_seq_length) import sys sys.exit() @@ -433,207 +173,127 @@ def build_training_sample(sample, } -def _build_index_mappings(name, data_prefix, documents, sizes, - num_samples, seq_length, seed, cutoff_last_epoch=0.95): - """Build doc-idx, sample-idx, and shuffle-idx. - doc-idx: is an array (ordered) of documents to be used in training. - sample-idx: is the start document index and document offset for each - training sample. - shuffle-idx: maps the sample index into a random index into sample-idx. - """ - # Number of tokens in each epoch and number of required epochs. - tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) - # rng state - np_rng = np.random.RandomState(seed=seed) - - # Filename of the index mappings. - _filename = data_prefix - _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) - _filename += '_{}sl'.format(seq_length) - _filename += '_{}s'.format(seed) - doc_idx_filename = _filename + '_doc_idx.npy' - sample_idx_filename = _filename + '_sample_idx.npy' - shuffle_idx_filename = _filename + '_shuffle_idx.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0: - if (not os.path.isfile(doc_idx_filename)) or \ - (not os.path.isfile(sample_idx_filename)) or \ - (not os.path.isfile(shuffle_idx_filename)): - - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') - - # For the last epoch, decide whether include the entire epoch - # in the global shuffle or not. - - # If we need only one epoch, then separating last epoch does - # not mean anything. - if num_epochs == 1: - separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) - - else: - # Get the number of samples for the last epoch - num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' - num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples <= num_samples_per_epoch, \ - f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' - # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, - # seperate out the epoch and treat it differently. - separate_last_epoch = (last_epoch_num_samples < - int(cutoff_last_epoch * num_samples_per_epoch)) - if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' - else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, - num_samples_per_epoch), flush=True) - - # doc-idx. - start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. 
- # First compile and then import. - from megatron.data import helpers - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - if separate_last_epoch: - num_samples_ = num_samples_from_epochs_minus_one - else: - num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) - - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) - - # Load mappings. - start_time = time.time() - print_rank_0(' > loading doc-idx mapping from {}'.format( - doc_idx_filename)) - doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading sample-idx mapping from {}'.format( - sample_idx_filename)) - sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading shuffle-idx mapping from {}'.format( - shuffle_idx_filename)) - shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( - time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) - - return doc_idx, sample_idx, shuffle_idx - - -def _num_tokens(documents, sizes): - """Total number of tokens in the dataset.""" - return np.sum(sizes[documents]) - - -def _num_epochs(tokens_per_epoch, seq_length, num_samples): - """Based on number of samples and sequence lenght, calculate how many - epochs will be needed.""" - num_epochs = 0 - total_tokens = 0 - while True: - num_epochs += 1 - total_tokens += tokens_per_epoch - # -1 is because we need to retrieve seq_length + 1 token each time - # but the last token will overlap with the first token of the next - # sample except for the last sample. - if ((total_tokens - 1) // seq_length) >= num_samples: - return num_epochs - - -def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): - """Build an array with length = number-of-epochs * number-of-dcuments. 
- Each index is mapped to a corresponding document.""" - if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] - doc_idx[:] = documents - doc_idx = doc_idx.reshape(-1) - doc_idx = doc_idx.astype(np.int32) - np_rng.shuffle(doc_idx) - return doc_idx - - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) - doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) - return np.concatenate((doc_idx_first, doc_idx_last)) - - -def _build_shuffle_idx(num_samples, total_size, np_rng): - """Build the range [0, size) and shuffle.""" - print(' > building shuffle index with split [0, {}) and [{}, {}) ' - '...'.format(num_samples, num_samples, total_size), flush=True) - - dtype_ = np.uint32 - if total_size >= (np.iinfo(np.uint32).max - 1): - dtype_ = np.int64 - - shuffle_idx_first = np.arange(start=0, stop=num_samples, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_first) - if num_samples == total_size: - return shuffle_idx_first - - shuffle_idx_last = np.arange(start=num_samples, stop=total_size, - step=1, dtype=dtype_) - np_rng.shuffle(shuffle_idx_last) - - return np.concatenate((shuffle_idx_first, shuffle_idx_last)) - - -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): + # train_sample = { + # 'text_enc': tokens_enc, + # 'text_dec': tokens_dec_in, + # 'labels': labels, + # 'loss_mask': loss_mask, + # 'truncated': int(truncated), + # 'enc_mask': enc_mask, + # 'dec_mask': dec_mask, + # 'enc_dec_mask': enc_dec_mask, + # } + # return train_sample + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): """Pad sequences and convert them to numpy.""" + sentinel_tokens = collections.deque(sentinel_tokens) + t5_input = [] + (t5_decoder_in, t5_decoder_out) = ([bos_id], []) + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_in.extend(span.label) + t5_decoder_out.append(flag) + t5_decoder_out.extend(span.label) + + end_index = span.index[0] + t5_input.extend(tokens[start_index: end_index]) + t5_input.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + # Add token to the t5_decoder_out + t5_decoder_out.append(eos_id) + + # Add the remaining tokens to the t5 input + t5_input.extend(tokens[start_index:]) + + # assert (len(t5_input) - len(masked_spans)) + \ + # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) + # Some checks. - num_tokens = len(tokens) + + # Encoder-side padding mask. + num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens assert padding_length >= 0 + assert len(masked_positions) == len(masked_labels) + + # Tokens.. + filler = [pad_id] * padding_length + tokens_enc = np.array(t5_input + filler, dtype=np.int64) - # Tokens and token types. - filler = np.array([pad_id] * padding_length) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + # Decoder-side padding mask. 
+ num_tokens_dec = len(t5_decoder_in) + padding_length_dec = max_seq_length_dec - num_tokens_dec + assert padding_length_dec >= 0 + filler_dec = [pad_id] * padding_length_dec + tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - return tokens_np \ No newline at end of file + # Create attention masks + enc_mask = make_attention_mask(tokens_enc, tokens_enc) + enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) + dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) + dec_mask = dec_mask * make_history_mask(tokens_dec_in) + + # Labels mask. + labels = t5_decoder_out + ([-1] * padding_length_dec) + labels = np.array(labels, dtype=np.int64) + + # Loss mask + loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) + loss_mask = np.array(loss_mask, dtype=np.int64) + + return tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask + + +def make_attention_mask(source_block, target_block): + """ + Returns a 2-dimensional (2-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) + mask = mask.astype(np.int64) + # (source_length, target_length) + return mask + + +def make_attention_mask_3d(source_block, target_block): + """ + Returns a 3-dimensional (3-D) attention mask + :param source_block: 1-D array + :param target_block: 1-D array + """ + mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) + # (batch, source_length, target_length) + # mask = mask.astype(np.int64) + return mask + + +def make_history_mask(block): + length = block.shape[0] + arange = np.arange(length) + history_mask = (arange[None, ] <= arange[:, None]) + history_mask = history_mask.astype(np.int64) + return history_mask + + +def make_history_mask_3d(block): + batch, length = block.shape + arange = torch.arange(length, device=block.device) + history_mask = (arange[None, ] <= arange[:, None])[None, ] + history_mask = history_mask.expand(batch, length, length) + return history_mask diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index b7af289a5..3a12e0b95 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -22,7 +22,7 @@ from megatron import get_timers from megatron import get_tokenizer from megatron import mpu -# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group + from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain @@ -184,14 +184,26 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: + # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + # data_prefix=args.data_path, + # data_impl=args.data_impl, + # splits_string=args.split, + # train_valid_test_num_samples=train_val_test_num_samples, + # seq_length=args.seq_length, + # seed=args.seed, + # skip_warmup=(not args.mmap_warmup)) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, + max_seq_length=512,#args.encoder_seq_length, + max_seq_length_dec=114,#args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, 
seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + skip_warmup=(not args.mmap_warmup), + dataset_type='t5') # Option 2 of data loading using --(train|valid|test)-weighted-split-paths elif args.train_weighted_split_paths: From 039f90f210aa4c2527ab352a696200aa9931241d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:51:46 +0700 Subject: [PATCH 076/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 1b5d37250..43b25544a 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -144,8 +144,7 @@ def build_training_sample(sample, target_seq_length, max_seq_length_dec, masked_spans, bos_id, eos_id, sentinel_tokens) - text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - + #text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") From 7364781eb27cea97ad71d2be23576c291919835e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 00:56:19 +0700 Subject: [PATCH 077/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 212 +++++++++++++++++++++++- 1 file changed, 205 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 43b25544a..6425c8b72 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -15,18 +15,216 @@ """GPT Non-Causal Mask Language Model Finetune Style dataset.""" +import os +import time import collections import numpy as np import torch -from megatron import get_tokenizer -from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping -) - -class T5Dataset(torch.utils.data.Dataset): +from megatron import mpu, print_rank_0, get_tokenizer +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
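# get_datasets_weights_and_num_samples is used as a black box in this patch; its
# contract is easy to state. Given ["W1", "PATH1", "W2", "PATH2", ...] it
# normalizes the weights and hands each component enough samples to cover its
# share. A sketch (the slight oversampling factor is illustrative, not the
# exact constant used by the helper):
import math

def weights_and_num_samples(data_prefix, train_valid_test_num_samples):
    weights = [float(w) for w in data_prefix[0::2]]
    prefixes = [str(p) for p in data_prefix[1::2]]
    total = sum(weights)
    weights = [w / total for w in weights]
    per_dataset_num_samples = [
        [int(math.ceil(w * n * 1.005)) for n in train_valid_test_num_samples]
        for w in weights
    ]
    return prefixes, weights, per_dataset_num_samples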
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, seed, skip_warmup) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, train_valid_test): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed on the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets(paths[0], + splits[0], + data_impl, + train_valid_test_num_samples, + seq_length, seed, skip_warmup, + dataset_group_name, train_valid_test) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is on the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets(prefixes[i], + splits[i], + data_impl, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, skip_warmup, + dataset_group_name, train_valid_test) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, + seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. 
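# The range_string parsed just above is the START:END piece of the
# "WEIGHT START:END PATH" syntax documented in build_dataset_group. A sketch of
# the mapping onto document indices, assuming fractional bounds in [0, 1]:
def split_by_range(range_string, size):
    start_frac, end_frac = (float(x) for x in range_string.split(':'))
    start, end = int(round(start_frac * size)), int(round(end_frac * size))
    assert 0 <= start <= end <= size
    return [start, end]   # [start_index, end_index), used to slice documents

# split_by_range("0.8:1.0", 10_000) -> [8000, 10000]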
+ print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = NonCausalMLMDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, + train_valid_test_num_samples, + seq_length, seed, skip_warmup): + """Build train, valid, and test datasets.""" + + + # Indexed dataset. + indexed_dataset = get_indexed_dataset_(data_prefix, + data_impl, + skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. + print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = NonCausalMLMDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset + +class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, num_epochs, max_num_samples, masked_lm_prob, From 5b1100a422be729014353b22590aca9c839319d9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:13:34 +0700 Subject: [PATCH 078/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 236 +++++++++--------------- 1 file changed, 83 insertions(+), 153 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6425c8b72..adc752280 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -25,150 +25,72 @@ from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions -from megatron.data.dataset_utils import get_train_valid_test_split_, 
get_split_by_range_ +from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_, get_indexed_dataset_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup): - """Build train, valid, and test datasets.""" - - # Single dataset. + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head=False, + max_seq_length_dec=None, + dataset_type='standard_bert'): if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], + return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup) - # Blending dataset. - else: - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - all_train_datasets = BlendableDataset(train_datasets, weights) \ - if train_datasets else None - all_valid_datasets = BlendableDataset(valid_datasets, weights) \ - if valid_datasets else None - all_test_datasets = BlendableDataset(test_datasets, weights) \ - if test_datasets else None - - return all_train_datasets, all_valid_datasets, all_test_datasets - - -def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, train_valid_test): - ''' - Build a single dataset group corresponding to Option 2 of data loading see arguments.py - a dataset group is passed on the following form - GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 - or alternatively - GIVEN_NAME PATH1 # for a single dataset to be used fully - ''' - - assert train_valid_test in ["train","valid","test"] - - # Single dataset. - if len(paths) == 1: - dataset = _build_single_datasets(paths[0], - splits[0], - data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - dataset_group_name, train_valid_test) - return dataset + max_seq_length, masked_lm_prob, + short_seq_prob, seed, + skip_warmup, + binary_head, + max_seq_length_dec, + dataset_type=dataset_type) # Blending dataset. - else: - - data_prefix = [] - # data_prefix is on the shape: - # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] - for w,p in zip(weights, paths): - data_prefix += [w,p] - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. 
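# For orientation, the call shape this rebuilt entry point now expects matches
# Megatron's T5 builder. The sketch below mirrors the values hard-wired earlier
# in this series (512/114 encoder/decoder lengths, dataset_type='t5'); the
# splits string, sample counts, and masking probability are placeholders:
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
    data_prefix=['data/mc4-id_text_document'],
    data_impl='mmap',
    splits_string='949,50,1',
    train_valid_test_num_samples=[1000, 100, 10],
    max_seq_length=512,
    masked_lm_prob=0.15,
    short_seq_prob=0.1,
    seed=1234,
    skip_warmup=True,
    max_seq_length_dec=114,
    dataset_type='t5',
)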
- datasets = [] - for i in range(len(prefixes)): - ds = _build_single_datasets(prefixes[i], - splits[i], - data_impl, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, skip_warmup, - dataset_group_name, train_valid_test) - - datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) - - return all_datasets - -def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): - """Build a single dataset""" - - assert train_valid_test in ["train","valid","test"] - index = ["train","valid","test"].index(train_valid_test) - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - # this corresponds to option2 for data loading on the form - # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 - # splits here is an array of size 2 [start_index, end_index] - splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - print_rank_0(' {}:'.format(dataset_group_name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[0], splits[1], - splits[1] - splits[0])) - - def build_dataset(name): - dataset = None - if splits[1] > splits[0]: - documents = np.arange(start=splits[0], stop=splits[1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) - return dataset - - dataset = build_dataset(dataset_group_name) - - return dataset + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
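# The BlendableDataset constructed below draws from its component datasets in
# proportion to the normalized weights. A toy version of the index-building
# pass (the real one lives in a compiled helper; a greedy assignment is shown
# here for clarity):
def build_blending_indices(weights, num_samples):
    counts = [0] * len(weights)
    dataset_index, dataset_sample_index = [], []
    for i in range(num_samples):
        # pick the dataset whose realized share lags its target the most
        errors = [w * (i + 1) - c for w, c in zip(weights, counts)]
        d = max(range(len(weights)), key=lambda j: errors[j])
        dataset_index.append(d)
        dataset_sample_index.append(counts[d])
        counts[d] += 1
    return dataset_index, dataset_sample_index

# build_blending_indices([0.75, 0.25], 4)[0] -> [0, 0, 1, 0]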
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) + + return (blending_train_dataset, blending_valid_dataset, + blending_test_dataset) def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - seq_length, seed, skip_warmup): + max_seq_length, + masked_lm_prob, short_seq_prob, seed, + skip_warmup, binary_head, + max_seq_length_dec, + dataset_type='standard_bert'): """Build train, valid, and test datasets.""" @@ -177,8 +99,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0] - # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') @@ -188,6 +109,11 @@ def print_split_stats(name, index): print_rank_0(' document indices in [{}, {}) total of {} ' 'documents'.format(splits[index], splits[index + 1], splits[index + 1] - splits[index])) + start_index = indexed_dataset.doc_idx[splits[index]] + end_index = indexed_dataset.doc_idx[splits[index + 1]] + print_rank_0(' sentence indices in [{}, {}) total of {} ' + 'sentences'.format(start_index, end_index, + end_index - start_index)) print_split_stats('train', 0) print_split_stats('validation', 1) print_split_stats('test', 2) @@ -195,12 +121,30 @@ def print_split_stats(name, index): def build_dataset(index, name): dataset = None if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = NonCausalMLMDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + # Get the pointer to the original doc-idx so we can set it later. + doc_idx_ptr = indexed_dataset.get_doc_idx() + # Slice the doc-idx + start_index = splits[index] + # Add +1 so we can index into the dataset to get the upper bound. + end_index = splits[index + 1] + 1 + # New doc_idx view. + indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) + # Build the dataset accordingly. 
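# The splits array indexed throughout build_dataset comes from
# get_train_valid_test_split_; its contract, sketched (weights may be comma- or
# slash-separated and need not sum to one):
def train_valid_test_split(splits_string, size):
    parts = [float(s) for s in splits_string.replace('/', ',').split(',')]
    parts = (parts + [0.0, 0.0, 0.0])[:3]
    total = sum(parts)
    index = [0]
    for p in parts:
        index.append(index[-1] + int(round(p / total * size)))
    index[-1] = size          # absorb rounding drift in the last boundary
    return index              # 4 boundaries: [train, valid, test, end]

# train_valid_test_split('949,50,1', 1000) -> [0, 949, 999, 1000]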
+ kwargs = dict( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=max_seq_length, + seed=seed, + ) + dataset = NonCausalMLMDataset( + indexed_dataset=indexed_dataset, + masked_lm_prob=masked_lm_prob, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs + ) return dataset train_dataset = build_dataset(0, 'train') @@ -210,20 +154,6 @@ def build_dataset(index, name): return (train_dataset, valid_dataset, test_dataset) -def get_indexed_dataset_(path, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - start_time = time.time() - indexed_dataset = make_indexed_dataset(path, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, From 45102a93e0dd2be089467e616b79112742ea7887 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:14:54 +0700 Subject: [PATCH 079/297] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 56 +++++++++++++------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3a12e0b95..7bbb4308d 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -205,34 +205,34 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): skip_warmup=(not args.mmap_warmup), dataset_type='t5') - # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - elif args.train_weighted_split_paths: - assigned_train_valid_test = [] - if args.train_weighted_split_paths is not None: - train_ds = [] - assigned_train_valid_test.append("train") - if args.valid_weighted_split_paths is not None: - valid_ds = [] - assigned_train_valid_test.append("valid") - if args.test_weighted_split_paths is not None: - test_ds = [] - assigned_train_valid_test.append("test") - - for s in assigned_train_valid_test: - data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - eval(f"args.{s}_weighted_split_weights"), - eval(f"args.{s}_weighted_split_splits"), - eval(f"args.{s}_weighted_split_names")) - for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) - eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") + # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + # elif args.train_weighted_split_paths: + # assigned_train_valid_test = [] + # if args.train_weighted_split_paths is not None: + # train_ds = [] + # assigned_train_valid_test.append("train") + # if args.valid_weighted_split_paths is not None: + # valid_ds = [] + # assigned_train_valid_test.append("valid") + # if args.test_weighted_split_paths is not None: + # test_ds = [] + # assigned_train_valid_test.append("test") + + # for s in assigned_train_valid_test: + # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + # eval(f"args.{s}_weighted_split_weights"), + # eval(f"args.{s}_weighted_split_splits"), + # eval(f"args.{s}_weighted_split_names")) + # for paths, 
weights, splits, name in data_groups: + # d = build_dataset_group(name, paths, weights, splits, + # args.data_impl, + # train_val_test_num_samples, + # args.seq_length, args.seed, + # (not args.mmap_warmup), + # train_valid_test=s) + # eval(f"{s}_ds").append(d) + # else: + # raise NotImplementedError("No dataloading argument passed") print_rank_0("> finished creating GPT datasets ...") return train_ds, valid_ds, test_ds From 7b2ebbf79e937832ed5617ab801e4aa6209c49d0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:15:18 +0700 Subject: [PATCH 080/297] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 7bbb4308d..3175f35da 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From fe8b3dc0829dca4f5f51500f6fd60c0a5789dd9c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:16:50 +0700 Subject: [PATCH 081/297] try t5 dataset --- train_non_causal_mlm_adaptation_gpt.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3175f35da..6e57dfd15 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,7 +23,8 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets +#, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From f456725e184697d31f6b43ff86f56bcf04281452 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:18:27 +0700 Subject: [PATCH 082/297] try t5 dataset --- examples/train_non_causal_mlm_adaption.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index b47ea7142..6cb1c38a0 100644 --- a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -9,7 +9,7 @@ DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 8 train_non_causal_mlm_adaption_mlm.py \ +deepspeed --num_gpus 8 train_non_causal_mlm_adaption_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ From ae73d8cf0673e36b11023a44252ea82183ae843c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:19:16 +0700 Subject: [PATCH 083/297] try t5 dataset --- examples/train_non_causal_mlm_adaption.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index 6cb1c38a0..e150e3cee 100644 --- 
a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -9,7 +9,7 @@ DATA_PATH=data/mc4-id_text_document CHECKPOINT_PATH=data -deepspeed --num_gpus 8 train_non_causal_mlm_adaption_gpt.py \ +deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ --num-layers 2 \ --hidden-size 128 \ --num-attention-heads 4 \ From fae6a0bd60303fe77582514e1ac58ba4738f7d69 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:29:01 +0700 Subject: [PATCH 084/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 65 +++++++++++++------------ train_non_causal_mlm_adaptation_gpt.py | 3 +- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index adc752280..b8b17a8a6 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -47,38 +47,39 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, max_seq_length_dec, dataset_type=dataset_type) # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) + else: + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. 
+ blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 6e57dfd15..3175f35da 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -23,8 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets -#, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ From 818584244126445c822c6ae3573246bee40b161d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:39:19 +0700 Subject: [PATCH 085/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 67 ++++++++++++------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index b8b17a8a6..e3db5f00c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -47,39 +47,38 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, max_seq_length_dec, dataset_type=dataset_type) # Blending dataset. - else: - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], data_impl, splits_string, + datasets_train_valid_test_num_samples[i], + max_seq_length, masked_lm_prob, short_seq_prob, + seed, skip_warmup, binary_head, dataset_type=dataset_type) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + # Blend. + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset(train_datasets, weights) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset(valid_datasets, weights) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset(test_datasets, weights) return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) @@ -100,7 +99,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 + total_num_of_documents = indexed_dataset.sizes.shape[0]# - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') From 9deef493b15bdf830ca825c86e7b2c075d09694e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:45:51 +0700 Subject: [PATCH 086/297] try t5 dataset --- megatron/data/non_causal_mlm_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e3db5f00c..6fd2fef81 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -36,6 +36,9 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup, binary_head=False, max_seq_length_dec=None, dataset_type='standard_bert'): + print(len(data_prefix)) + import sys + sys.exit() if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, @@ -99,7 +102,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0]# - 1 + total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. print_rank_0(' > dataset split:') From 1e78a4bd1186d053a7f81e1cdf586ca22305f95e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 01:54:53 +0700 Subject: [PATCH 087/297] developing --- megatron/data/non_causal_mlm_dataset.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 6fd2fef81..e45966fa8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -36,9 +36,6 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, skip_warmup, binary_head=False, max_seq_length_dec=None, dataset_type='standard_bert'): - print(len(data_prefix)) - import sys - sys.exit() if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, @@ -148,6 +145,11 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + indexed_dataset.set_doc_idx(doc_idx_ptr) + # Checks. 
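# Context for the "- 1" above and the doc-idx slicing: doc_idx carries one
# boundary per document plus a terminating bound, so a split needs entries up to
# splits[index + 1] + 1 to know where its last document ends. A toy illustration
# (treating boundaries as token offsets purely for the arithmetic):
import numpy as np

doc_idx = np.array([0, 3, 7, 12, 20])   # 4 documents, 5 boundaries
start, end = 1, 3                        # a split covering documents 1 and 2
view = doc_idx[start:end + 1]            # [3, 7, 12]: bounds for both documents
assert view[-1] - view[0] == 9           # total tokens covered by the split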
+ assert indexed_dataset.doc_idx[0] == 0 + assert indexed_dataset.doc_idx.shape[0] == \ + (total_num_of_documents + 1) return dataset train_dataset = build_dataset(0, 'train') From 9070929d72a8361d406d60c103716af05b234341 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:03:09 +0700 Subject: [PATCH 088/297] developing --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e45966fa8..231275b8c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -147,9 +147,9 @@ def build_dataset(index, name): ) indexed_dataset.set_doc_idx(doc_idx_ptr) # Checks. - assert indexed_dataset.doc_idx[0] == 0 - assert indexed_dataset.doc_idx.shape[0] == \ - (total_num_of_documents + 1) + # assert indexed_dataset.doc_idx[0] == 0 + # assert indexed_dataset.doc_idx.shape[0] == \ + # (total_num_of_documents + 1) return dataset train_dataset = build_dataset(0, 'train') From 56c69de0c3adb49b97201d68ad3f51ceb32f124a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:04:37 +0700 Subject: [PATCH 089/297] developing --- megatron/data/non_causal_mlm_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 231275b8c..063cb9a61 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -277,18 +277,18 @@ def build_training_sample(sample, target_seq_length, max_seq_length_dec, masked_spans, bos_id, eos_id, sentinel_tokens) - #text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) + # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - text_tokens_ids = input_tokens_ids + output_tokens_ids + # text_tokens_ids = input_tokens_ids + output_tokens_ids print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) print(output_tokens_ids) - print("text_tokens_ids") - # print(text_tokens_ids) - print(len(text_tokens_ids)) + # print("text_tokens_ids") + # # print(text_tokens_ids) + # print(len(text_tokens_ids)) # input_tokens_ids = pad_and_convert_to_numpy( # input_tokens_ids, From d1ca91438ffd430602539423156e3601079b56e3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:24:27 +0700 Subject: [PATCH 090/297] developing --- examples/train_non_causal_mlm_adaption.sh | 1 + megatron/data/non_causal_mlm_dataset.py | 38 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh index e150e3cee..a595fe161 100644 --- a/examples/train_non_causal_mlm_adaption.sh +++ b/examples/train_non_causal_mlm_adaption.sh @@ -39,4 +39,5 @@ deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ --eval-interval 1000 \ --eval-iters 10 \ --fp16 \ + --loss-on-targets-only \ --tensorboard-dir LOG diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 063cb9a61..a02c27e12 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -269,13 +269,37 @@ def build_training_sample(sample, target_seq_length, cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, max_ngrams=10, geometric_dist=True, masking_style="t5") - # Padding. 
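# The --loss-on-targets-only flag added to the launch script pairs with the
# prefix_len computed below: tokens before the prefix boundary are dropped from
# the loss. Conceptually it amounts to the mask sketched here; the real masking
# happens inside get_ltor_masks_and_position_ids via prefix_indices:
import numpy as np

def targets_only_loss_mask(seq_length, prefix_len):
    mask = np.zeros(seq_length, dtype=np.int64)
    mask[prefix_len:] = 1      # only the target segment contributes to the loss
    return mask

# targets_only_loss_mask(8, 5) -> array([0, 0, 0, 0, 0, 1, 1, 1])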
- input_tokens_ids, output_tokens_ids, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask \ - = pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, max_seq_length, - max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) + sentinel_tokens = collections.deque(sentinel_tokens) + input_tokens_ids = [] + output_tokens_ids = [] #[bos_id] + (start_index, end_index) = (0, None) + for span in masked_spans: + flag = sentinel_tokens.popleft() + + output_tokens_ids.append(flag) + output_tokens_ids.extend(span.label) + + end_index = span.index[0] + input_tokens_ids.extend(tokens[start_index: end_index]) + input_tokens_ids.append(flag) + + # the next start index is the token after the last span token + start_index = span.index[-1] + 1 + + + # Add the remaining tokens to input_tokens_ids + input_tokens_ids.extend(tokens[start_index:]) + # Add token to the output_tokens_ids + output_tokens_ids.append(eos_id) + prefix_len = len(input_tokens_ids) + + # # Padding. + # input_tokens_ids, _, output_tokens_ids, enc_mask, \ + # dec_mask, enc_dec_mask, loss_mask \ + # = pad_and_convert_to_numpy(tokens, masked_positions, + # masked_labels, pad_id, max_seq_length, + # max_seq_length_dec, masked_spans, + # bos_id, eos_id, sentinel_tokens) # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) From 13af6234f7d26c706f27ed4f3faca61035fed406 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:34:17 +0700 Subject: [PATCH 091/297] developing --- megatron/data/non_causal_mlm_dataset.py | 169 +++++++++++++----------- 1 file changed, 90 insertions(+), 79 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index a02c27e12..ef5eeea82 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -289,6 +289,7 @@ def build_training_sample(sample, target_seq_length, # Add the remaining tokens to input_tokens_ids input_tokens_ids.extend(tokens[start_index:]) + input_tokens_ids.append(eos_id) # Add token to the output_tokens_ids output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) @@ -303,28 +304,26 @@ def build_training_sample(sample, target_seq_length, # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - # text_tokens_ids = input_tokens_ids + output_tokens_ids + text_tokens_ids = pad_and_convert_to_numpy( + input_tokens_ids+output_tokens_ids, + pad_id, + max_seq_length+max_seq_length_dec + ) + print("input_tokens_ids") print(len(input_tokens_ids)) print(input_tokens_ids) print("output_tokens_ids") print(len(output_tokens_ids)) print(output_tokens_ids) - # print("text_tokens_ids") - # # print(text_tokens_ids) - # print(len(text_tokens_ids)) - - # input_tokens_ids = pad_and_convert_to_numpy( - # input_tokens_ids, - # self.tokenizer.pad, - # self.seq_length - # ) - + print("text_tokens_ids") + print(text_tokens_ids) + print(len(text_tokens_ids)) import sys sys.exit() return { - 'text': input_tokens_ids, + 'text': text_tokens_ids, 'prefix_len': prefix_len } @@ -341,78 +340,90 @@ def build_training_sample(sample, target_seq_length, # } # return train_sample - -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): +def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" - sentinel_tokens = collections.deque(sentinel_tokens) - t5_input = [] - 
(t5_decoder_in, t5_decoder_out) = ([bos_id], []) - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) - t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) - t5_decoder_out.extend(span.label) - - end_index = span.index[0] - t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - # Add token to the t5_decoder_out - t5_decoder_out.append(eos_id) - - # Add the remaining tokens to the t5 input - t5_input.extend(tokens[start_index:]) - - # assert (len(t5_input) - len(masked_spans)) + \ - # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) - # Some checks. - - # Encoder-side padding mask. - num_tokens = len(t5_input) + num_tokens = len(tokens) padding_length = max_seq_length - num_tokens assert padding_length >= 0 - assert len(masked_positions) == len(masked_labels) - - # Tokens.. - filler = [pad_id] * padding_length - tokens_enc = np.array(t5_input + filler, dtype=np.int64) - - # Decoder-side padding mask. - num_tokens_dec = len(t5_decoder_in) - padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 - filler_dec = [pad_id] * padding_length_dec - tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - - # Create attention masks - enc_mask = make_attention_mask(tokens_enc, tokens_enc) - enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) - dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) - dec_mask = dec_mask * make_history_mask(tokens_dec_in) - - # Labels mask. - labels = t5_decoder_out + ([-1] * padding_length_dec) - labels = np.array(labels, dtype=np.int64) - - # Loss mask - loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) - loss_mask = np.array(loss_mask, dtype=np.int64) - - return tokens_enc, tokens_dec_in, labels, enc_mask, \ - dec_mask, enc_dec_mask, loss_mask + + # Tokens and token types. + filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np +# def pad_and_convert_to_numpy(tokens, masked_positions, +# masked_labels, pad_id, +# max_seq_length, max_seq_length_dec, +# masked_spans=None, bos_id=None, +# eos_id=None, sentinel_tokens=None): +# """Pad sequences and convert them to numpy.""" + +# sentinel_tokens = collections.deque(sentinel_tokens) +# t5_input = [] +# (t5_decoder_in, t5_decoder_out) = ([bos_id], []) +# (start_index, end_index) = (0, None) +# for span in masked_spans: +# flag = sentinel_tokens.popleft() + +# # Append the same tokens in decoder input and output +# t5_decoder_in.append(flag) +# t5_decoder_in.extend(span.label) +# t5_decoder_out.append(flag) +# t5_decoder_out.extend(span.label) + +# end_index = span.index[0] +# t5_input.extend(tokens[start_index: end_index]) +# t5_input.append(flag) + +# # the next start index is the token after the last span token +# start_index = span.index[-1] + 1 + +# # Add token to the t5_decoder_out +# t5_decoder_out.append(eos_id) + +# # Add the remaining tokens to the t5 input +# t5_input.extend(tokens[start_index:]) + +# # assert (len(t5_input) - len(masked_spans)) + \ +# # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) + +# # Some checks. + +# # Encoder-side padding mask. 
+# num_tokens = len(t5_input) +# padding_length = max_seq_length - num_tokens +# assert padding_length >= 0 +# assert len(masked_positions) == len(masked_labels) + +# # Tokens.. +# filler = [pad_id] * padding_length +# tokens_enc = np.array(t5_input + filler, dtype=np.int64) + +# # Decoder-side padding mask. +# num_tokens_dec = len(t5_decoder_in) +# padding_length_dec = max_seq_length_dec - num_tokens_dec +# assert padding_length_dec >= 0 +# filler_dec = [pad_id] * padding_length_dec +# tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) + +# # Create attention masks +# enc_mask = make_attention_mask(tokens_enc, tokens_enc) +# enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) +# dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) +# dec_mask = dec_mask * make_history_mask(tokens_dec_in) + +# # Labels mask. +# labels = t5_decoder_out + ([-1] * padding_length_dec) +# labels = np.array(labels, dtype=np.int64) + +# # Loss mask +# loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) +# loss_mask = np.array(loss_mask, dtype=np.int64) + +# return tokens_enc, tokens_dec_in, labels, enc_mask, \ +# dec_mask, enc_dec_mask, loss_mask def make_attention_mask(source_block, target_block): From dbc555e138982d784304171c5a57f63ac71a17c6 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:35:50 +0700 Subject: [PATCH 092/297] developing --- megatron/data/non_causal_mlm_dataset.py | 147 +----------------------- 1 file changed, 1 insertion(+), 146 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ef5eeea82..0380d1623 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -294,52 +294,18 @@ def build_training_sample(sample, target_seq_length, output_tokens_ids.append(eos_id) prefix_len = len(input_tokens_ids) - # # Padding. 
- # input_tokens_ids, _, output_tokens_ids, enc_mask, \ - # dec_mask, enc_dec_mask, loss_mask \ - # = pad_and_convert_to_numpy(tokens, masked_positions, - # masked_labels, pad_id, max_seq_length, - # max_seq_length_dec, masked_spans, - # bos_id, eos_id, sentinel_tokens) - - # text_tokens_ids = np.array(input_tokens_ids+output_tokens_ids) - text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, max_seq_length+max_seq_length_dec ) - print("input_tokens_ids") - print(len(input_tokens_ids)) - print(input_tokens_ids) - print("output_tokens_ids") - print(len(output_tokens_ids)) - print(output_tokens_ids) - print("text_tokens_ids") - print(text_tokens_ids) - print(len(text_tokens_ids)) - import sys - sys.exit() - return { 'text': text_tokens_ids, 'prefix_len': prefix_len } - # train_sample = { - # 'text_enc': tokens_enc, - # 'text_dec': tokens_dec_in, - # 'labels': labels, - # 'loss_mask': loss_mask, - # 'truncated': int(truncated), - # 'enc_mask': enc_mask, - # 'dec_mask': dec_mask, - # 'enc_dec_mask': enc_dec_mask, - # } - # return train_sample - def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" @@ -352,115 +318,4 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): filler = np.array([pad_id] * padding_length) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - return tokens_np -# def pad_and_convert_to_numpy(tokens, masked_positions, -# masked_labels, pad_id, -# max_seq_length, max_seq_length_dec, -# masked_spans=None, bos_id=None, -# eos_id=None, sentinel_tokens=None): -# """Pad sequences and convert them to numpy.""" - -# sentinel_tokens = collections.deque(sentinel_tokens) -# t5_input = [] -# (t5_decoder_in, t5_decoder_out) = ([bos_id], []) -# (start_index, end_index) = (0, None) -# for span in masked_spans: -# flag = sentinel_tokens.popleft() - -# # Append the same tokens in decoder input and output -# t5_decoder_in.append(flag) -# t5_decoder_in.extend(span.label) -# t5_decoder_out.append(flag) -# t5_decoder_out.extend(span.label) - -# end_index = span.index[0] -# t5_input.extend(tokens[start_index: end_index]) -# t5_input.append(flag) - -# # the next start index is the token after the last span token -# start_index = span.index[-1] + 1 - -# # Add token to the t5_decoder_out -# t5_decoder_out.append(eos_id) - -# # Add the remaining tokens to the t5 input -# t5_input.extend(tokens[start_index:]) - -# # assert (len(t5_input) - len(masked_spans)) + \ -# # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) - -# # Some checks. - -# # Encoder-side padding mask. -# num_tokens = len(t5_input) -# padding_length = max_seq_length - num_tokens -# assert padding_length >= 0 -# assert len(masked_positions) == len(masked_labels) - -# # Tokens.. -# filler = [pad_id] * padding_length -# tokens_enc = np.array(t5_input + filler, dtype=np.int64) - -# # Decoder-side padding mask. -# num_tokens_dec = len(t5_decoder_in) -# padding_length_dec = max_seq_length_dec - num_tokens_dec -# assert padding_length_dec >= 0 -# filler_dec = [pad_id] * padding_length_dec -# tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) - -# # Create attention masks -# enc_mask = make_attention_mask(tokens_enc, tokens_enc) -# enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc) -# dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in) -# dec_mask = dec_mask * make_history_mask(tokens_dec_in) - -# # Labels mask. 
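# After this cleanup the training sample is just the packed token ids plus the
# prefix length. A sketch of how a get_batch-style consumer uses it, mirroring
# the shift already done in pretrain_mp3_gpt.py (names are illustrative):
import numpy as np

def unpack(sample):
    tokens_ = sample['text']
    tokens, labels = tokens_[:-1], tokens_[1:]   # shift for next-token prediction
    prefix_index = int(sample['prefix_len'])     # boundary for the bidirectional prefix
    return tokens, labels, prefix_index

# unpack({'text': np.arange(6), 'prefix_len': 3}) ->
#   (array([0, 1, 2, 3, 4]), array([1, 2, 3, 4, 5]), 3)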
-# labels = t5_decoder_out + ([-1] * padding_length_dec) -# labels = np.array(labels, dtype=np.int64) - -# # Loss mask -# loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec) -# loss_mask = np.array(loss_mask, dtype=np.int64) - -# return tokens_enc, tokens_dec_in, labels, enc_mask, \ -# dec_mask, enc_dec_mask, loss_mask - - -def make_attention_mask(source_block, target_block): - """ - Returns a 2-dimensional (2-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1) - mask = mask.astype(np.int64) - # (source_length, target_length) - return mask - - -def make_attention_mask_3d(source_block, target_block): - """ - Returns a 3-dimensional (3-D) attention mask - :param source_block: 1-D array - :param target_block: 1-D array - """ - mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1) - # (batch, source_length, target_length) - # mask = mask.astype(np.int64) - return mask - - -def make_history_mask(block): - length = block.shape[0] - arange = np.arange(length) - history_mask = (arange[None, ] <= arange[:, None]) - history_mask = history_mask.astype(np.int64) - return history_mask - - -def make_history_mask_3d(block): - batch, length = block.shape - arange = torch.arange(length, device=block.device) - history_mask = (arange[None, ] <= arange[:, None])[None, ] - history_mask = history_mask.expand(batch, length, length) - return history_mask + return tokens_np \ No newline at end of file From 12b209dde8abe3f61082c418ca33956d318fd2c3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 02:44:33 +0700 Subject: [PATCH 093/297] developing --- megatron/tokenizer/tokenizer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 7079ad353..de0dc8fc0 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -370,6 +370,21 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), + AddedToken('', lstrip=False, rstrip=False, normalization=False), ] }) From 698eff056346c8f0c403433310c692a868efdb50 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 11:44:13 +0700 Subject: [PATCH 094/297] test to see output of get_ltor_masks_and_position_ids --- megatron/tokenizer/tokenizer.py | 12 ------------ 
train_non_causal_mlm_adaptation_gpt.py | 9 +++++++++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index de0dc8fc0..f9cec21ab 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -331,18 +331,6 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - # if 'mask_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - - # if 'cls_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - - # if 'sep_token' not in self.tokenizer.special_tokens_map: - # self.tokenizer.add_tokens("") - # self.tokenizer.mask_token = "" - self.tokenizer.add_special_tokens({ 'additional_special_tokens': [ AddedToken('', lstrip=False, rstrip=False, normalization=False), diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index 3175f35da..b19e7c811 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -104,6 +104,15 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) + print("attention_mask") + print(attention_mask) + print("loss_mask") + print(loss_mask) + print("position_ids") + print(position_ids) + import sys + sys.exit() + # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From dae3cc6ce86bd3c54a9b5723b209d7e9f6701e73 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 11:51:29 +0700 Subject: [PATCH 095/297] test to see output of get_ltor_masks_and_position_ids --- train_non_causal_mlm_adaptation_gpt.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_non_causal_mlm_adaptation_gpt.py index b19e7c811..62c461bad 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_non_causal_mlm_adaptation_gpt.py @@ -104,15 +104,16 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - print("attention_mask") - print(attention_mask) - print("loss_mask") - print(loss_mask) - print("position_ids") - print(position_ids) + import numpy as np + with open('attention_mask.npy', 'wb') as f: + np.save(f, attention_mask.cpu().numpy()) + with open('loss_mask.npy', 'wb') as f: + np.save(f, loss_mask.cpu().numpy()) + with open('position_ids.npy', 'wb') as f: + np.save(f, position_ids.cpu().numpy()) import sys sys.exit() - + # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From 5c109c3c4a0de1f8e10a56ee01bcdeebec1df454 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:11:53 +0700 Subject: [PATCH 096/297] add new script --- examples/4B8-en.sh | 153 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 examples/4B8-en.sh diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh new file mode 100644 index 000000000..4d86fb2d6 --- /dev/null +++ b/examples/4B8-en.sh @@ -0,0 +1,153 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/mc4-id_text_document + + +# XXX: edit me +GPUS_PER_NODE=8 
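+# (with NNODES=1 below, the world size is 8 GPUs; DP_SIZE then comes out to
+# 8 / (PP_SIZE * TP_SIZE) = 8 / (2 * 1) = 4)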
+NNODES=1 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=2048 +TRAIN_ITER=131_072 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +SEQ_LEN=626 + +if [[ ${ROUND} == 1 ]]; then EXIT_INTERVAL=100 SAVE_INTERVAL=10 +elif [[ ${ROUND} == 2 ]]; then SAVE_INTERVAL=1500 +else echo "invalid ROUND: $ROUND" +fi + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --lr-decay-samples 146_484_375 \ + --lr-warmup-samples 183_105 \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --seq-length $SEQ_LEN \ + --max-position-embeddings $SEQ_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --rampup-batch-size 32 32 2_000_000 \ + --train-samples $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path bigscience/tokenizer \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval 1000 \ + --eval-iters 100 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.$SLURM_JOBID.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +export LAUNCHER="python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + " + # --nnodes $NNODES \ + # --master_addr $MASTER_ADDR \ + # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/pretrain_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. 
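+# (mkdir -p is a no-op when the directories already exist, so re-running the
+# script reuses the same experiment folder)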
+mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) +srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file From 2fc99951e2a14a3db4df4fc3ac412ab011e2110d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:13:32 +0700 Subject: [PATCH 097/297] add new script --- examples/4B8-en.sh | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index 4d86fb2d6..21f48cbfe 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -117,12 +117,12 @@ DEEPSPEED_ARGS=" \ --deepspeed-activation-checkpointing \ " -export LAUNCHER="python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ - " - # --nnodes $NNODES \ - # --master_addr $MASTER_ADDR \ - # --master_port $MASTER_PORT \ +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ +# # --master_port $MASTER_PORT \ export CMD=" \ `pwd`/pretrain_gpt.py \ @@ -150,4 +150,9 @@ echo $CMD mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits and prints what it would have launched) -srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file From ee7af99a3fc128cea0c51b2829f2690ad7f3615a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sun, 29 May 2022 23:14:27 +0700 Subject: [PATCH 098/297] add new script --- examples/4B8-en.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index 21f48cbfe..c34176c98 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -27,10 +27,7 @@ NHEADS=64 FFN_HIDDEN_SIZE=10240 SEQ_LEN=626 -if [[ ${ROUND} == 1 ]]; then EXIT_INTERVAL=100 SAVE_INTERVAL=10 -elif [[ ${ROUND} == 2 ]]; then SAVE_INTERVAL=1500 -else echo "invalid ROUND: $ROUND" -fi +SAVE_INTERVAL=1500 OPTIMIZER_ARGS=" \ --optimizer adam \ From b6701a851e81a25e5d23889d22a2b008f7ef464c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 30 May 2022 10:29:01 +0700 Subject: [PATCH 099/297] changed settings --- examples/4B8-en.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index c34176c98..fc8064450 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -37,8 +37,6 @@ OPTIMIZER_ARGS=" \ --lr 2e-4 \ --min-lr 1e-5 \ --lr-decay-style cosine \ - --lr-decay-samples 146_484_375 \ - --lr-warmup-samples 183_105 \ --clip-grad 1.0 \ --weight-decay 1e-1 \ " @@ -57,7 +55,7 @@ GPT_ARGS=" \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --rampup-batch-size 32 32 2_000_000 \ - --train-samples $TRAIN_ITER \ + --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ @@ -71,8 +69,8 @@ GPT_ARGS=" \ OUTPUT_ARGS=" \ --log-interval 200 \ --save-interval $SAVE_INTERVAL \ - --eval-interval 1000 \ - --eval-iters 100 \ + --eval-interval 0 \ + --eval-iters 0 \ --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ From 2283e581b73425036a759c6a925b8556b75c1fee Mon Sep 17 00:00:00 2001 From: Lintang Sutawika 
Date: Mon, 30 May 2022 10:32:03 +0700 Subject: [PATCH 100/297] changed settings --- examples/4B8-en.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/4B8-en.sh b/examples/4B8-en.sh index fc8064450..5a44d30d8 100644 --- a/examples/4B8-en.sh +++ b/examples/4B8-en.sh @@ -54,7 +54,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ - --rampup-batch-size 32 32 2_000_000 \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path bigscience/tokenizer \ From 9d00a49ffbce5e0f05f799040832c72fe425bc00 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 31 May 2022 23:11:51 +0700 Subject: [PATCH 101/297] tidy up --- examples/4B8-en.sh => 4B8-en-CD-FLM.sh | 31 +- 4B8-en-ND-MLM.sh | 154 ++++++++++ 4B8-en-ND-MTF.sh | 154 ++++++++++ examples/finetune_mp3.sh | 42 --- examples/train_non_causal_mlm_adaption.sh | 43 --- ...m_adaptation_gpt.py => train_ND_MLM_gpt.py | 22 +- train_ND_MTF_gpt.py | 287 ++++++++++++++++++ 7 files changed, 625 insertions(+), 108 deletions(-) rename examples/4B8-en.sh => 4B8-en-CD-FLM.sh (89%) create mode 100644 4B8-en-ND-MLM.sh create mode 100644 4B8-en-ND-MTF.sh delete mode 100644 examples/finetune_mp3.sh delete mode 100644 examples/train_non_causal_mlm_adaption.sh rename train_non_causal_mlm_adaptation_gpt.py => train_ND_MLM_gpt.py (94%) create mode 100644 train_ND_MTF_gpt.py diff --git a/examples/4B8-en.sh b/4B8-en-CD-FLM.sh similarity index 89% rename from examples/4B8-en.sh rename to 4B8-en-CD-FLM.sh index 5a44d30d8..f8963414c 100644 --- a/examples/4B8-en.sh +++ b/4B8-en-CD-FLM.sh @@ -1,14 +1,13 @@ #!/bin/bash -EXPERIMENT_NAME=4B8-en +EXPERIMENT_NAME=4B8-en-CD-FLM REPO_PATH=experiments/$EXPERIMENT_NAME CHECKPOINT_PATH=$REPO_PATH/checkpoints TENSORBOARD_PATH=$REPO_PATH/tensorboard CODECARBON_PATH=$REPO_PATH/codecarbon LOGS_PATH=$REPO_PATH/logs -DATA_PATH=data/mc4-id_text_document - +DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document # XXX: edit me GPUS_PER_NODE=8 @@ -17,15 +16,17 @@ PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here TP_SIZE=1 # always fixed to the size of a single node DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer -MICRO_BATCH_SIZE=1 +MICRO_BATCH_SIZE=32 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=131_072 +SEQ_LEN=626 + NLAYERS=24 NHIDDEN=4096 NHEADS=64 FFN_HIDDEN_SIZE=10240 -SEQ_LEN=626 +MAX_POSITION_EMBEDDING=1280 SAVE_INTERVAL=1500 @@ -45,13 +46,16 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " + + GPT_ARGS=" \ --num-layers $NLAYERS \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --seq-length $SEQ_LEN \ --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ + --seq-length $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ @@ -66,10 +70,10 @@ GPT_ARGS=" \ " OUTPUT_ARGS=" \ - --log-interval 200 \ + --log-interval 1 \ --save-interval $SAVE_INTERVAL \ - --eval-interval 0 \ - --eval-iters 0 \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ --tensorboard-dir $TENSORBOARD_PATH \ --tensorboard-queue-size 5 \ --log-timers-to-tensorboard \ @@ -79,7 +83,7 @@ OUTPUT_ARGS=" \ ZERO_STAGE=1 -config_json="./ds_config.$SLURM_JOBID.json" +config_json="./ds_config.json" # Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() cat < $config_json @@ -145,8 +149,11 @@ mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits 
and prints what it would have launched) -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ +# python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# $CMD + +deepspeed --num_gpus $GPUS_PER_NODE \ $CMD # srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh new file mode 100644 index 000000000..5f96a39b1 --- /dev/null +++ b/4B8-en-ND-MLM.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en-ND-MLM +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document + +# XXX: edit me +GPUS_PER_NODE=8 +NNODES=1 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=2048 +TRAIN_ITER=39_718 +INPUT_LEN=512 +TARGET_LEN=114 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +MAX_POSITION_EMBEDDING=1280 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-iters $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path t5-base \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ +# # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/train_ND_MLM_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ 
+ --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. +mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/4B8-en-ND-MTF.sh b/4B8-en-ND-MTF.sh new file mode 100644 index 000000000..a532a96fa --- /dev/null +++ b/4B8-en-ND-MTF.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +EXPERIMENT_NAME=4B8-en-ND-MTF +REPO_PATH=experiments/$EXPERIMENT_NAME +CHECKPOINT_PATH=$REPO_PATH/checkpoints +TENSORBOARD_PATH=$REPO_PATH/tensorboard +CODECARBON_PATH=$REPO_PATH/codecarbon +LOGS_PATH=$REPO_PATH/logs + +DATA_PATH=data/mc4-id_text_document + +# XXX: edit me +GPUS_PER_NODE=8 +NNODES=1 +PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here +TP_SIZE=1 # always fixed to the size of a single node +DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer + +MICRO_BATCH_SIZE=1 +GLOBAL_BATCH_SIZE=1024 +TRAIN_ITER=10_000 +INPUT_LEN=1024 +TARGET_LEN=256 + +NLAYERS=24 +NHIDDEN=4096 +NHEADS=64 +FFN_HIDDEN_SIZE=10240 +MAX_POSITION_EMBEDDING=1280 + +SAVE_INTERVAL=1500 + +OPTIMIZER_ARGS=" \ + --optimizer adam \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr 2e-4 \ + --min-lr 1e-5 \ + --lr-decay-style cosine \ + --clip-grad 1.0 \ + --weight-decay 1e-1 \ + " + +EXIT_OPTS=" \ + --exit-duration-in-mins 1190 \ + " + +GPT_ARGS=" \ + --num-layers $NLAYERS \ + --hidden-size $NHIDDEN \ + --num-attention-heads $NHEADS \ + --ffn-hidden-size $FFN_HIDDEN_SIZE \ + --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-iters $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path t5-base \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ 
+# # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/train_ND_MTF_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. +mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/examples/finetune_mp3.sh b/examples/finetune_mp3.sh deleted file mode 100644 index 59cb34d4c..000000000 --- a/examples/finetune_mp3.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH=data/mc4-id_text_document -CHECKPOINT_PATH=data - - -deepspeed --num_gpus 8 pretrain_mp3_gpt.py \ - --num-layers 2 \ - --hidden-size 128 \ - --num-attention-heads 4 \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --seq-length 626 \ - --max-position-embeddings 1024 \ - --train-iters 10000 \ - --lr-decay-iters 5000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 \ - --tensorboard-dir LOG diff --git a/examples/train_non_causal_mlm_adaption.sh b/examples/train_non_causal_mlm_adaption.sh deleted file mode 100644 index a595fe161..000000000 --- a/examples/train_non_causal_mlm_adaption.sh +++ /dev/null @@ -1,43 +0,0 @@ -#! 
/bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH=data/mc4-id_text_document -CHECKPOINT_PATH=data - - -deepspeed --num_gpus 8 train_non_causal_mlm_adaptation_gpt.py \ - --num-layers 2 \ - --hidden-size 128 \ - --num-attention-heads 4 \ - --micro-batch-size 4 \ - --global-batch-size 1024 \ - --seq-length 626 \ - --max-position-embeddings 1024 \ - --train-iters 10000 \ - --lr-decay-iters 5000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 \ - --loss-on-targets-only \ - --tensorboard-dir LOG diff --git a/train_non_causal_mlm_adaptation_gpt.py b/train_ND_MLM_gpt.py similarity index 94% rename from train_non_causal_mlm_adaptation_gpt.py rename to train_ND_MLM_gpt.py index 62c461bad..0326e778a 100644 --- a/train_non_causal_mlm_adaptation_gpt.py +++ b/train_ND_MLM_gpt.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Pretrain GPT""" +"""Non-Causal Decoder GPT MLM Adaptation""" import torch from functools import partial @@ -52,6 +52,14 @@ def model_provider(pre_process=True, post_process=True): parallel_output=True, prefix_lm=True ) + # loaded_dir, state_dict = model[0].load_checkpoint( + # args.finetune, load_optimizer_states=False) + # if loaded_dir is None: + # print_rank_0('WARNING: could not find the metadata file {} '.format( + # load_dir)) + # print_rank_0(' will not load any checkpoints and will start from ' + # 'random') + # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize model._megatron_batch_fn = get_batch_pipe @@ -194,21 +202,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: - # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - # data_prefix=args.data_path, - # data_impl=args.data_impl, - # splits_string=args.split, - # train_valid_test_num_samples=train_val_test_num_samples, - # seq_length=args.seq_length, - # seed=args.seed, - # skip_warmup=(not args.mmap_warmup)) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512,#args.encoder_seq_length, - max_seq_length_dec=114,#args.decoder_seq_length, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, diff --git a/train_ND_MTF_gpt.py b/train_ND_MTF_gpt.py new file mode 100644 index 000000000..d16c9bb82 --- /dev/null +++ b/train_ND_MTF_gpt.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Non-Causal Decoder GPT Multitask Finetuning"""
+
+import torch
+from functools import partial
+from megatron import get_args
+from megatron import print_rank_0
+from megatron import get_timers
+from megatron import get_tokenizer
+from megatron import mpu
+
+from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets #, build_dataset_group
+from megatron.model import GPTModel, GPTModelPipe
+from megatron.training import pretrain
+from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_
+from megatron.utils import average_losses_across_data_parallel_group
+
+import deepspeed
+from deepspeed.runtime.utils import see_memory_usage
+import subprocess
+
+def model_provider(pre_process=True, post_process=True):
+    """Build the model."""
+
+    print_rank_0('building GPT model ...')
+    see_memory_usage(f"Before Building Model", force=True)
+
+    args = get_args()
+
+    with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
+                             remote_device=None if args.remote_device == 'none' else args.remote_device,
+                             config_dict_or_path=args.deepspeed_config,
+                             enabled=args.zero_stage == 3,
+                             mpu=mpu):
+        if args.deepspeed:
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                prefix_lm=True
+            )
+            # loaded_dir, state_dict = model[0].load_checkpoint(
+            #     args.finetune, load_optimizer_states=False)
+            # if loaded_dir is None:
+            #     print_rank_0('WARNING: could not find the metadata file {} '.format(
+            #         load_dir))
+            #     print_rank_0('    will not load any checkpoints and will start from '
+            #                  'random')
+
+            # This is a hack to give us a reference to get_batch_pipe from within training.py
+            # We need to call model.set_batch_fn after deepspeed.initialize
+            model._megatron_batch_fn = get_batch_pipe
+
+        else:
+            model = GPTModel(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=pre_process,
+                post_process=post_process,
+                prefix_lm=True
+            )
+    see_memory_usage(f"After Building Model", force=True)
+    return model
+
+_KEYS = ['text', 'prefix_len']
+
+def get_batch(data_iterator):
+    """Generate a batch"""
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # Items and their type.
+    keys = _KEYS
+    datatype = torch.int64
+
+    # Broadcast data.
+    if data_iterator is not None:
+        data = next(data_iterator)
+    else:
+        data = None
+    data_b = mpu.broadcast_data(keys, data, datatype)
+
+    # Unpack.
+    tokens_ = data_b['text'].long()
+    labels = tokens_[:, 1:].contiguous()
+    tokens = tokens_[:, :-1].contiguous()
+
+    # Prefix
+    prefix_indices = data_b['prefix_len'].cpu().tolist()
+
+    # Get the masks and position ids.
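+    # (prefix_indices marks, per sample, where the targets begin: attention over
+    # the prefix is made bidirectional instead of causal, and with
+    # --loss-on-targets-only the loss_mask also zeroes the prefix positions.)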
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + import numpy as np + with open('attention_mask.npy', 'wb') as f: + np.save(f, attention_mask.cpu().numpy()) + with open('loss_mask.npy', 'wb') as f: + np.save(f, loss_mask.cpu().numpy()) + with open('position_ids.npy', 'wb') as f: + np.save(f, position_ids.cpu().numpy()) + import sys + sys.exit() + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].cpu().tolist() + + # Get the masks and position ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
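+    # NOTE: get_batch above still ends in a debugging probe (it np.save's the
+    # three mask/position arrays and then calls sys.exit()), so any run that
+    # reaches this forward step will dump the arrays and stop.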
+ timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + # data_prefix=args.data_path, + # data_impl=args.data_impl, + # splits_string=args.split, + # train_valid_test_num_samples=train_val_test_num_samples, + # seq_length=args.seq_length, + # seed=args.seed, + # skip_warmup=(not args.mmap_warmup)) + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='t5') + + # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + # elif args.train_weighted_split_paths: + # assigned_train_valid_test = [] + # if args.train_weighted_split_paths is not None: + # train_ds = [] + # assigned_train_valid_test.append("train") + # if args.valid_weighted_split_paths is not None: + # valid_ds = [] + # assigned_train_valid_test.append("valid") + # if args.test_weighted_split_paths is not None: + # test_ds = [] + # assigned_train_valid_test.append("test") + + # for s in assigned_train_valid_test: + # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + # eval(f"args.{s}_weighted_split_weights"), + # eval(f"args.{s}_weighted_split_splits"), + # eval(f"args.{s}_weighted_split_names")) + # for paths, weights, splits, name in data_groups: + # d = build_dataset_group(name, paths, weights, splits, + # args.data_impl, + # train_val_test_num_samples, + # args.seq_length, args.seed, + # (not args.mmap_warmup), + # train_valid_test=s) + # eval(f"{s}_ds").append(d) + # else: + # raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + 
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 0298fde935fa1ac1aa4bad630b2d3a452f859465 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 31 May 2022 23:15:04 +0700 Subject: [PATCH 102/297] changed tokenizer and position embedding --- 4B8-en-CD-FLM.sh | 2 -- 4B8-en-ND-MLM.sh | 5 +++-- 4B8-en-ND-MTF.sh | 5 +++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/4B8-en-CD-FLM.sh b/4B8-en-CD-FLM.sh index f8963414c..17079579d 100644 --- a/4B8-en-CD-FLM.sh +++ b/4B8-en-CD-FLM.sh @@ -46,8 +46,6 @@ EXIT_OPTS=" \ --exit-duration-in-mins 1190 \ " - - GPT_ARGS=" \ --num-layers $NLAYERS \ --hidden-size $NHIDDEN \ diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 5f96a39b1..583ff1893 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -51,14 +51,15 @@ GPT_ARGS=" \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path t5-base \ + --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ diff --git a/4B8-en-ND-MTF.sh b/4B8-en-ND-MTF.sh index a532a96fa..209732ad3 100644 --- a/4B8-en-ND-MTF.sh +++ b/4B8-en-ND-MTF.sh @@ -51,14 +51,15 @@ GPT_ARGS=" \ --hidden-size $NHIDDEN \ --num-attention-heads $NHEADS \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $MAX_POSITION_EMBEDDING \ + --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path t5-base \ + --tokenizer-name-or-path bigscience/tokenizer \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ From bde07f08867ddce6280e28be8cd272035421cbbb Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:31:45 +0700 Subject: [PATCH 103/297] modifying mlm to reflect original implementation --- 4B8-en-ND-MLM.sh | 3 +- megatron/data/non_causal_mlm_dataset.py | 231 ++++++++++++++++++--- megatron/tokenizer/tokenizer.py | 55 +---- prepare_tokenizer.py | 16 ++ pretrain_mp3_gpt.py | 257 ------------------------ 5 files changed, 223 insertions(+), 339 deletions(-) create mode 100644 prepare_tokenizer.py delete mode 100644 pretrain_mp3_gpt.py diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 583ff1893..c8e1ba0d6 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -8,6 +8,7 @@ CODECARBON_PATH=$REPO_PATH/codecarbon LOGS_PATH=$REPO_PATH/logs DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document +TOKENIZER_PATH=bigscience-tokenizer-padded # XXX: edit me GPUS_PER_NODE=8 @@ -59,7 +60,7 @@ GPT_ARGS=" \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ + --tokenizer-name-or-path $TOKENIZER_PATH \ --loss-scale 12 \ --clip-grad 1.0 \ --fp16 \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0380d1623..ed633fb32 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -262,38 +262,63 @@ def build_training_sample(sample, 
target_seq_length, truncated = len(tokens) > max_num_tokens tokens = tokens[:max_num_tokens] - # Masking. - max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( - tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, - max_ngrams=10, geometric_dist=True, masking_style="t5") - - sentinel_tokens = collections.deque(sentinel_tokens) - input_tokens_ids = [] - output_tokens_ids = [] #[bos_id] - (start_index, end_index) = (0, None) - for span in masked_spans: - flag = sentinel_tokens.popleft() - - output_tokens_ids.append(flag) - output_tokens_ids.extend(span.label) - - end_index = span.index[0] - input_tokens_ids.extend(tokens[start_index: end_index]) - input_tokens_ids.append(flag) - - # the next start index is the token after the last span token - start_index = span.index[-1] + 1 - - - # Add the remaining tokens to input_tokens_ids - input_tokens_ids.extend(tokens[start_index:]) - input_tokens_ids.append(eos_id) - # Add token to the output_tokens_ids - output_tokens_ids.append(eos_id) + max_ngrams = 3 + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. + expanded_inputs_length, targets_length = compute_input_and_target_lengths( + max_seq_length, + masked_lm_prob, + max_ngrams + ) + + mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) + labels_mask = ~mask_indices + + input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) + labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) + + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel) + output_tokens_ids = filter_input_ids(tokens, labels_sentinel) + + # # Masking. 
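+    # (The commented-out block kept below is the previous
+    # create_masked_lm_predictions masking path; the replacement above follows
+    # the span-corruption recipe from HF's FlaxDataCollatorForT5MLM. Note that
+    # the helpers it calls, create_sentinel_ids and filter_input_ids, were
+    # copied over with their `self` parameters and self.tokenizer references
+    # intact, so they still need to be rebound to module level before this
+    # path can run end to end.)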
+ # max_predictions_per_seq = masked_lm_prob * max_num_tokens + # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( + # tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + # cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + # max_ngrams=max_ngrams, geometric_dist=True, masking_style="t5") + + # sentinel_tokens = collections.deque(sentinel_tokens) + # input_tokens_ids = [] + # output_tokens_ids = [] #[bos_id] + # (start_index, end_index) = (0, None) + # for span in masked_spans: + # flag = sentinel_tokens.popleft() + + # output_tokens_ids.append(flag) + # output_tokens_ids.extend(span.label) + + # end_index = span.index[0] + # input_tokens_ids.extend(tokens[start_index: end_index]) + # input_tokens_ids.append(flag) + + # # the next start index is the token after the last span token + # start_index = span.index[-1] + 1 + + + # # Add the remaining tokens to input_tokens_ids + # input_tokens_ids.extend(tokens[start_index:]) + # input_tokens_ids.append(eos_id) + # # Add token to the output_tokens_ids + # output_tokens_ids.append(eos_id) + prefix_len = len(input_tokens_ids) + print("input_tokens_ids") + print(input_tokens_ids) + print("output_tokens_ids") + print(output_tokens_ids) + text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, @@ -318,4 +343,148 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): filler = np.array([pad_id] * padding_length) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - return tokens_np \ No newline at end of file + return tokens_np + + +def create_sentinel_ids(self, mask_indices): + """ + Sentinel ids creation given the indices that should be masked. + The start indices of each mask are replaced by the sentinel ids in increasing + order. Consecutive mask indices to be deleted are replaced with `-1`. + """ + start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices + start_indices[:, 0] = mask_indices[:, 0] + + sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) + sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0) + sentinel_ids -= mask_indices - start_indices + + return sentinel_ids + + +def filter_input_ids(self, input_ids, sentinel_ids): + """ + Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. + This will reduce the sequence length from `expanded_inputs_length` to `input_length`. + """ + batch_size = input_ids.shape[0] + + input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) + # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are + # masked tokens coming after sentinel tokens and should be removed + input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1)) + input_ids = np.concatenate( + [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1 + ) + return input_ids + + +def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length): + """This function is copy of `random_spans_helper `__ . + Training parameters to avoid padding with random_spans_noise_mask. + When training a model with random_spans_noise_mask, we would like to set the other + training hyperparmeters in a way that avoids padding. + This function helps us compute these hyperparameters. 
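+    (For example, with inputs_length=512, noise_density=0.15 and
+    mean_noise_span_length=3 this yields tokens_length=568 and targets_length=114,
+    which is where INPUT_LEN=512 / TARGET_LEN=114 in 4B8-en-ND-MLM.sh come from.)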
+ We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens, + and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens. + This function tells us the required number of tokens in the raw example (for split_tokens()) + as well as the length of the encoded targets. Note that this function assumes + the inputs and targets will have EOS appended and includes that in the reported length. + Args: + inputs_length: an integer - desired length of the tokenized inputs sequence + noise_density: a float + mean_noise_span_length: a float + Returns: + tokens_length: length of original text in tokens + targets_length: an integer - length in tokens of encoded targets sequence + """ + + def _tokens_length_to_inputs_length_targets_length(tokens_length): + num_noise_tokens = int(round(tokens_length * noise_density)) + num_nonnoise_tokens = tokens_length - num_noise_tokens + num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length)) + # inputs contain all nonnoise tokens, sentinels for all noise spans + # and one EOS token. + _input_length = num_nonnoise_tokens + num_noise_spans + 1 + _output_length = num_noise_tokens + num_noise_spans + 1 + return _input_length, _output_length + + tokens_length = inputs_length + + while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length: + tokens_length += 1 + + inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length) + + # minor hack to get the targets length to be equal to inputs length + # which is more likely to have been set to a nice round number. + if noise_density == 0.5 and targets_length > inputs_length: + tokens_length -= 1 + targets_length -= 1 + return tokens_length, targets_length + + +def random_spans_noise_mask( + length, + noise_density=0.15, + mean_noise_span_length=3 + ): + + """This function is copy of `random_spans_helper `__ . + Noise mask consisting of random spans of noise tokens. + The number of noise tokens and the number of noise spans and non-noise spans + are determined deterministically as follows: + num_noise_tokens = round(length * noise_density) + num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) + Spans alternate between non-noise and noise, beginning with non-noise. + Subject to the above restrictions, all masks are equally likely. + Args: + length: an int32 scalar (length of the incoming token sequence) + noise_density: a float - approximate density of output mask + mean_noise_span_length: a number + Returns: + a boolean tensor with shape [length] + """ + + orig_length = length + + num_noise_tokens = int(np.round(length * noise_density)) + # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. + num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) + num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length)) + + # avoid degeneracy by ensuring positive number of noise spans + num_noise_spans = max(num_noise_spans, 1) + num_nonnoise_tokens = length - num_noise_tokens + + # pick the lengths of the noise spans and the non-noise spans + def _random_segmentation(num_items, num_segments): + """Partition a sequence of items randomly into non-empty segments. 
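+        (e.g. num_items=10, num_segments=3 could yield segment lengths
+        [4, 1, 5]: positive integers that sum to num_items.)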
+ Args: + num_items: an integer scalar > 0 + num_segments: an integer scalar in [1, num_items] + Returns: + a Tensor with shape [num_segments] containing positive integers that add + up to num_items + """ + mask_indices = np.arange(num_items - 1) < (num_segments - 1) + np.random.shuffle(mask_indices) + first_in_segment = np.pad(mask_indices, [[1, 0]]) + segment_id = np.cumsum(first_in_segment) + # count length of sub segments assuming that list is sorted + _, segment_length = np.unique(segment_id, return_counts=True) + return segment_length + + noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) + nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans) + + interleaved_span_lengths = np.reshape( + np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2] + ) + span_starts = np.cumsum(interleaved_span_lengths)[:-1] + span_start_indicator = np.zeros((length,), dtype=np.int8) + span_start_indicator[span_starts] = True + span_num = np.cumsum(span_start_indicator) + is_noise = np.equal(span_num % 2, 1) + + return is_noise[:orig_length] \ No newline at end of file diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index f9cec21ab..91e0ad1d6 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -17,7 +17,7 @@ from abc import ABC from abc import abstractmethod -from transformers import AutoTokenizer, AddedToken +from transformers import AutoTokenizer from .bert_tokenization import FullTokenizer as FullBertTokenizer from .gpt2_tokenization import GPT2Tokenizer @@ -331,51 +331,6 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - self.tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - 
AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - AddedToken('', lstrip=False, rstrip=False, normalization=False), - ] - }) - @property def vocab_size(self): return len(self.tokenizer) # vocab_size doesn't contain additional tokens @@ -398,6 +353,10 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def eod(self): + return self.tokenizer.eos_token_id + @property def cls(self): return self.tokenizer.cls_token_id @@ -410,10 +369,6 @@ def sep(self): def pad(self): return self.tokenizer.pad_token_id - @property - def eod(self): - return self.tokenizer.eos_token_id - @property def mask(self): return self.tokenizer.mask_token_id diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py new file mode 100644 index 000000000..e058ac62a --- /dev/null +++ b/prepare_tokenizer.py @@ -0,0 +1,16 @@ +from transformers import AutoTokenizer, AddedToken + +tokenizer = AutoTokenizer.from_pretrained('bigscience/tokenizer') + +tokenizer.add_special_tokens({ + 'additional_special_tokens': [ + AddedToken( + ''.format(str(idx).zfill(3)), + lstrip=False, + rstrip=False, + normalization=False + ) for idx in reversed(range(0,200)) + ] + }) + +tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py deleted file mode 100644 index b7af289a5..000000000 --- a/pretrain_mp3_gpt.py +++ /dev/null @@ -1,257 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Pretrain GPT""" - -import torch -from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron import mpu -# from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import subprocess - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - prefix_lm=True - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=True - ) - see_memory_usage(f"After Building Model", force=True) - return model - -_KEYS = ['text', 'prefix_len'] - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and position ids. 
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. - timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) - - # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - elif args.train_weighted_split_paths: - assigned_train_valid_test = [] - if args.train_weighted_split_paths is not None: - train_ds = [] - assigned_train_valid_test.append("train") - if args.valid_weighted_split_paths is not None: - valid_ds = [] - assigned_train_valid_test.append("valid") - if args.test_weighted_split_paths is not None: - test_ds = [] - assigned_train_valid_test.append("test") - - for s in assigned_train_valid_test: - data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - eval(f"args.{s}_weighted_split_weights"), - eval(f"args.{s}_weighted_split_splits"), - eval(f"args.{s}_weighted_split_names")) - for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) - eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating GPT datasets ...") - return train_ds, valid_ds, test_ds - -def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 - -def git_ds_info(): - from deepspeed.env_report import main as ds_report - ds_report() - - # Write out version/git info - git_hash_cmd = "git rev-parse --short HEAD" - git_branch_cmd = "git rev-parse --abbrev-ref HEAD" - if command_exists('git'): - try: - result = subprocess.check_output(git_hash_cmd, shell=True) - git_hash = result.decode('utf-8').strip() - result = subprocess.check_output(git_branch_cmd, 
shell=True) - git_branch = result.decode('utf-8').strip() - except subprocess.CalledProcessError: - git_hash = "unknown" - git_branch = "unknown" - else: - git_hash = "unknown" - git_branch = "unknown" - print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') - - -if __name__ == "__main__": - git_ds_info() - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 4c0ca2e1d828330888b1bd9c093f38e8e409d567 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:40:15 +0700 Subject: [PATCH 104/297] minor fix --- 4B8-en-ND-MLM.sh | 1 + prepare_tokenizer.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index c8e1ba0d6..f0a5f59fb 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,6 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 +SEQ_LEN=INPUT_LEN+TARGET_LEN NLAYERS=24 NHIDDEN=4096 diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py index e058ac62a..280ba458d 100644 --- a/prepare_tokenizer.py +++ b/prepare_tokenizer.py @@ -13,4 +13,13 @@ ] }) -tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file +tokenizer.save_pretrained('bigscience-tokenizer-padded') + +# python tools/preprocess_data.py \ +# --input data/oscar-en-10k.jsonl \ +# --output-prefix data/meg-gpt2-oscar-en-10k \ +# --dataset-impl mmap \ +# --tokenizer-type PretrainedFromHF \ +# --tokenizer-name-or-path bigscience-tokenizer-padded \ +# --append-eod \ +# --workers 4 \ No newline at end of file From 0c05596d3c1ec8f6c4fab9f8a7f1b7965893fae7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:41:28 +0700 Subject: [PATCH 105/297] minor fix --- 4B8-en-ND-MLM.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index f0a5f59fb..e2317f119 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=INPUT_LEN+TARGET_LEN +SEQ_LEN=$(INPUT_LEN+TARGET_LEN) NLAYERS=24 NHIDDEN=4096 From 30f69248ea7bd2d58e7db3644b9afbf4733da03b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:42:10 +0700 Subject: [PATCH 106/297] minor fix --- 4B8-en-ND-MLM.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index e2317f119..bbc5be97d 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,13 +22,13 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$(INPUT_LEN+TARGET_LEN) +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 NHEADS=64 FFN_HIDDEN_SIZE=10240 -MAX_POSITION_EMBEDDING=1280 + SAVE_INTERVAL=1500 From 84408ef002fc206e736bc215434860ce98b59bba Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:43:47 +0700 Subject: [PATCH 107/297] minor fix --- 4B8-en-ND-MLM.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index bbc5be97d..77f91de81 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -150,8 +150,7 @@ mkdir -p $REPO_PATH mkdir -p $LOGS_PATH # to debug - add echo (it exits and prints what it would have launched) -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ +deepspeed --num_gpus $GPUS_PER_NODE \ $CMD # srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at 
end of file From ad964c5814f69b8542b55ec8f6fdb66b929d1cd2 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:47:38 +0700 Subject: [PATCH 108/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ed633fb32..4e12b59ee 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,8 +278,8 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) - input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel) - output_tokens_ids = filter_input_ids(tokens, labels_sentinel) + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) + output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens @@ -346,7 +346,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): return tokens_np -def create_sentinel_ids(self, mask_indices): +def create_sentinel_ids(mask_indices): """ Sentinel ids creation given the indices that should be masked. The start indices of each mask are replaced by the sentinel ids in increasing @@ -356,13 +356,13 @@ def create_sentinel_ids(self, mask_indices): start_indices[:, 0] = mask_indices[:, 0] sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) - sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0) + sentinel_ids = np.where(sentinel_ids != 0, (len(tokenizer) - sentinel_ids), 0) sentinel_ids -= mask_indices - start_indices return sentinel_ids -def filter_input_ids(self, input_ids, sentinel_ids): +def filter_input_ids(input_ids, sentinel_ids, eos_id): """ Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting. This will reduce the sequence length from `expanded_inputs_length` to `input_length`. 
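To make the sentinel round trip concrete, here is a self-contained dry run of create_sentinel_ids and filter_input_ids as they stand after this patch. The bodies are reconstructed from the HF T5 span-corruption script this file mirrors (the roll-based start detection is not fully visible in the hunks above), and vocab_len=100 / eos_id=1 are made-up values:

import numpy as np

def create_sentinel_ids(mask_indices, vocab_len):
    # Span starts get sentinel ids counted down from the top of the vocab;
    # span continuations become -1 so they can be filtered out below.
    start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
    start_indices[:, 0] = mask_indices[:, 0]
    sentinel_ids = np.where(start_indices != 0,
                            np.cumsum(start_indices, axis=-1), start_indices)
    sentinel_ids = np.where(sentinel_ids != 0, (vocab_len - sentinel_ids), 0)
    sentinel_ids -= mask_indices - start_indices
    return sentinel_ids

def filter_input_ids(input_ids, sentinel_ids, eos_id):
    # Substitute sentinels, drop continuation tokens (< 0), append EOS.
    batch_size = input_ids.shape[0]
    input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
    input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1))
    return np.concatenate(
        [input_ids, np.full((batch_size, 1), eos_id, dtype=np.int32)], axis=-1)

tokens = np.array([[10, 11, 12, 13, 14]])
mask = np.array([[0, 1, 1, 0, 0]], dtype=np.int8)   # positions 1-2 are noise
inputs = filter_input_ids(tokens, create_sentinel_ids(mask, 100), eos_id=1)
labels = filter_input_ids(tokens, create_sentinel_ids(1 - mask, 100), eos_id=1)
print(inputs)   # [[10 99 13 14  1]]  i.e. "10 <sentinel> 13 14 <eos>"
print(labels)   # [[99 11 12 98  1]]  i.e. "<sentinel> 11 12 <sentinel'> <eos>"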
@@ -374,7 +374,7 @@ def filter_input_ids(self, input_ids, sentinel_ids): # masked tokens coming after sentinel tokens and should be removed input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1)) input_ids = np.concatenate( - [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1 + [input_ids, np.full((batch_size, 1), eos_id, dtype=np.int32)], axis=-1 ) return input_ids From 45899e98bb5360ec37d3f9f5a52aa896c8c8424b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:48:05 +0700 Subject: [PATCH 109/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 4e12b59ee..0d771c58d 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -319,6 +319,9 @@ def build_training_sample(sample, target_seq_length, print("output_tokens_ids") print(output_tokens_ids) + import sys + sys.exit() + text_tokens_ids = pad_and_convert_to_numpy( input_tokens_ids+output_tokens_ids, pad_id, From 0b94597241b7ae3a885465cbb9fe485bd32cb6dd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:52:27 +0700 Subject: [PATCH 110/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0d771c58d..8ed22f234 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -275,8 +275,8 @@ def build_training_sample(sample, target_seq_length, mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) labels_mask = ~mask_indices - input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8)) - labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8)) + input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) + labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -349,7 +349,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): return tokens_np -def create_sentinel_ids(mask_indices): +def create_sentinel_ids(mask_indices, vocab_len): """ Sentinel ids creation given the indices that should be masked. 
The start indices of each mask are replaced by the sentinel ids in increasing @@ -359,7 +359,7 @@ def create_sentinel_ids(mask_indices): start_indices[:, 0] = mask_indices[:, 0] sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices) - sentinel_ids = np.where(sentinel_ids != 0, (len(tokenizer) - sentinel_ids), 0) + sentinel_ids = np.where(sentinel_ids != 0, (vocab_len - sentinel_ids), 0) sentinel_ids -= mask_indices - start_indices return sentinel_ids From 2b54cc1720c1166f2babe117905c7d2df65a9127 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 19:54:29 +0700 Subject: [PATCH 111/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 8ed22f234..641162050 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,6 +278,7 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) + tokens = np.asarray(tokens) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) From ec616272b4c88582e0a48f599b563040c94676bf Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:05:02 +0700 Subject: [PATCH 112/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 26 ++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 641162050..27d0c1a4e 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -278,7 +278,19 @@ def build_training_sample(sample, target_seq_length, input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - tokens = np.asarray(tokens) + + if len(tokens) < expanded_inputs_length: + tokens = pad_and_convert_to_numpy( + tokens, + pad_id, + expanded_inputs_length + ) + + tokens = np.asarray([tokens]) + print("input_ids_sentinel.shape") + print(input_ids_sentinel.shape) + print("tokens.shape") + print(tokens.shape) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -313,6 +325,12 @@ def build_training_sample(sample, target_seq_length, # # Add token to the output_tokens_ids # output_tokens_ids.append(eos_id) + # text_tokens_ids = pad_and_convert_to_numpy( + # input_tokens_ids+output_tokens_ids, + # pad_id, + # max_seq_length+max_seq_length_dec + # ) + prefix_len = len(input_tokens_ids) print("input_tokens_ids") @@ -323,12 +341,6 @@ def build_training_sample(sample, target_seq_length, import sys sys.exit() - text_tokens_ids = pad_and_convert_to_numpy( - input_tokens_ids+output_tokens_ids, - pad_id, - max_seq_length+max_seq_length_dec - ) - return { 'text': text_tokens_ids, 'prefix_len': prefix_len From 4448d1d39701b8195db088829c861fac6db6d73a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:13:58 +0700 Subject: [PATCH 113/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 8 ++------ 1 file changed, 2 insertions(+), 6 
deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 27d0c1a4e..371479e22 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -287,10 +287,6 @@ def build_training_sample(sample, target_seq_length, ) tokens = np.asarray([tokens]) - print("input_ids_sentinel.shape") - print(input_ids_sentinel.shape) - print("tokens.shape") - print(tokens.shape) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) @@ -334,9 +330,9 @@ def build_training_sample(sample, target_seq_length, prefix_len = len(input_tokens_ids) print("input_tokens_ids") - print(input_tokens_ids) + print(len(input_tokens_ids)) print("output_tokens_ids") - print(output_tokens_ids) + print(len(output_tokens_ids)) import sys sys.exit() From ecd148c789cb9cf9678cc923060dd9eb206217c7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:15:43 +0700 Subject: [PATCH 114/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 371479e22..7bf16ce03 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -287,8 +287,8 @@ def build_training_sample(sample, target_seq_length, ) tokens = np.asarray([tokens]) - input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id) - output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id) + input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] + output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] # # Masking. 
# max_predictions_per_seq = masked_lm_prob * max_num_tokens From a99f30f038c0be6ee2c38680d2ed70e08b815e9e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:29:31 +0700 Subject: [PATCH 115/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7bf16ce03..f709a9a8c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -181,7 +181,8 @@ def __init__(self, name, indexed_dataset, data_prefix, data_prefix, num_epochs, max_num_samples, - self.max_seq_length - 2, # account for added tokens + # self.max_seq_length - 2, # account for added tokens + self.max_seq_length*2, short_seq_prob, self.seed, self.name, @@ -329,14 +330,6 @@ def build_training_sample(sample, target_seq_length, prefix_len = len(input_tokens_ids) - print("input_tokens_ids") - print(len(input_tokens_ids)) - print("output_tokens_ids") - print(len(output_tokens_ids)) - - import sys - sys.exit() - return { 'text': text_tokens_ids, 'prefix_len': prefix_len From 62d3e3e9cbdd3817f4d91b4044134ab1714f3339 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:31:09 +0700 Subject: [PATCH 116/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f709a9a8c..ddbc1333c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -291,6 +291,13 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] + print("input_tokens_ids") + print(len(input_tokens_ids)) + print(input_tokens_ids) + print("output_tokens_ids") + print(len(output_tokens_ids)) + print(output_tokens_ids) + # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( From a160853196afbe397169240d795b0c9964cb9df9 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:42:18 +0700 Subject: [PATCH 117/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ddbc1333c..64a474a18 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -181,8 +181,7 @@ def __init__(self, name, indexed_dataset, data_prefix, data_prefix, num_epochs, max_num_samples, - # self.max_seq_length - 2, # account for added tokens - self.max_seq_length*2, + self.max_seq_length - 2, # account for added tokens short_seq_prob, self.seed, self.name, @@ -210,6 +209,14 @@ def __getitem__(self, idx): sample = [] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + + #concat more to avoid padding + for i in range(0,2): + _idx = random.randint(idx, self.__len__) + start_index, end_index, seq_length = self.samples_mapping[_idx] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
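Two pitfalls lurk in the random.randint(idx, self.__len__) line added here. First, self.__len__ is passed as a bound method rather than called, so random.randint raises a TypeError; a follow-up patch just below adds the missing parentheses. Second, even once called, Python's random.randint(a, b) is inclusive on both ends (exactly the python/numpy split the adjacent comment warns about), so it can return len(self) and index one past the end of samples_mapping. A half-open randrange sidesteps that:

import random

n = 10     # stand-in for len(self)
idx = 3
j = random.randrange(idx, n)   # never returns n, unlike random.randint(idx, n)
assert idx <= j < n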
np_rng = np.random.RandomState(seed=(self.seed + idx)) @@ -335,6 +342,9 @@ def build_training_sample(sample, target_seq_length, # max_seq_length+max_seq_length_dec # ) + text_tokens_ids = input_tokens_ids+output_tokens_ids + print*text_tokens_ids + prefix_len = len(input_tokens_ids) return { From fe205f7784ad266c4813be8ed270931945d23f4d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:43:53 +0700 Subject: [PATCH 118/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 64a474a18..ad7c5dc35 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -17,6 +17,7 @@ import os import time +import random import collections import numpy as np From d39bdaf9413328741d002ae23e11c7bf25181bfc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 20:45:16 +0700 Subject: [PATCH 119/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ad7c5dc35..1eff6762e 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -213,7 +213,7 @@ def __getitem__(self, idx): #concat more to avoid padding for i in range(0,2): - _idx = random.randint(idx, self.__len__) + _idx = random.randint(idx, self.__len__()) start_index, end_index, seq_length = self.samples_mapping[_idx] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) From 2530d3e08649f70adee71d771cafb94b6a3f21fd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:42:55 +0700 Subject: [PATCH 120/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 1eff6762e..286c2a3d8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -212,12 +212,14 @@ def __getitem__(self, idx): sample.append(self.indexed_dataset[index]) #concat more to avoid padding - for i in range(0,2): + while seq_length < (self.max_seq_length/self.masked_lm_prob): _idx = random.randint(idx, self.__len__()) - start_index, end_index, seq_length = self.samples_mapping[_idx] + start_index, end_index, _seq_length = self.samples_mapping[_idx] for index in range(start_index, end_index): sample.append(self.indexed_dataset[index]) + seq_length += _seq_length + # Note that this rng state should be numpy and not python since # python randint is inclusive whereas the numpy one is exclusive. 
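A quick sanity check on the new loop bound: with max_seq_length=512 and masked_lm_prob=0.15 (the 4B8 settings), documents keep being appended until roughly 512 / 0.15 ≈ 3413 raw tokens have been gathered, a generous over-estimate of the ~568 tokens (the expanded_inputs_length) that span corruption actually consumes for a 512-token encoder input; the surplus is simply truncated to expanded_inputs_length in a patch just below. Roughly:

max_seq_length, masked_lm_prob = 512, 0.15
print(max_seq_length / masked_lm_prob)   # ~3413.3 raw tokens gathered
# versus expanded_inputs_length ~= 568 actually used
# (see compute_input_and_target_lengths further down)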
np_rng = np.random.RandomState(seed=(self.seed + idx)) From 5e93c47ae2f56f1ae6206af5c32692148ec6ec70 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:47:06 +0700 Subject: [PATCH 121/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 286c2a3d8..cfd8730d5 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -263,15 +263,15 @@ def build_training_sample(sample, target_seq_length, sentinel_tokens: unique value to be substituted for every replaced span """ - assert target_seq_length <= max_seq_length + # assert target_seq_length <= max_seq_length # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] - # Truncate to `target_sequence_length`. - max_num_tokens = target_seq_length - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + # # Truncate to `target_sequence_length`. + # max_num_tokens = target_seq_length + # truncated = len(tokens) > max_num_tokens + # tokens = tokens[:max_num_tokens] max_ngrams = 3 # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. From ad867998f162912a38991dbb10d74ad95ac344ac Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:49:16 +0700 Subject: [PATCH 122/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index cfd8730d5..161af4885 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -290,12 +290,14 @@ def build_training_sample(sample, target_seq_length, labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - if len(tokens) < expanded_inputs_length: + if len(tokens) <= expanded_inputs_length: tokens = pad_and_convert_to_numpy( tokens, pad_id, expanded_inputs_length ) + else: + tokens = tokens[:expanded_inputs_length] tokens = np.asarray([tokens]) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] From 82c8d932a1790a6b9c072de31a8f5f4914640c8d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:51:42 +0700 Subject: [PATCH 123/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 161af4885..3920b9d95 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -347,8 +347,7 @@ def build_training_sample(sample, target_seq_length, # max_seq_length+max_seq_length_dec # ) - text_tokens_ids = input_tokens_ids+output_tokens_ids - print*text_tokens_ids + text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) prefix_len = len(input_tokens_ids) From ebf3561d0aabe1ca1c10f00e46c804bf5c75bdbc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 21:56:08 +0700 Subject: [PATCH 124/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 3920b9d95..59a1f2ee5 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ 
b/megatron/data/non_causal_mlm_dataset.py @@ -303,13 +303,6 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] - print("input_tokens_ids") - print(len(input_tokens_ids)) - print(input_tokens_ids) - print("output_tokens_ids") - print(len(output_tokens_ids)) - print(output_tokens_ids) - # # Masking. # max_predictions_per_seq = masked_lm_prob * max_num_tokens # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( @@ -348,6 +341,9 @@ def build_training_sample(sample, target_seq_length, # ) text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) + print("text_tokens_ids") + print(len(text_tokens_ids)) + print(text_tokens_ids) prefix_len = len(input_tokens_ids) From 811f97559e9f8632be23c8b66fcd79485d3526cf Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 22:51:52 +0700 Subject: [PATCH 125/297] minor fix --- megatron/data/non_causal_mlm_dataset.py | 3 --- train_ND_MLM_gpt.py | 10 ---------- train_ND_MTF_gpt.py | 10 ---------- 3 files changed, 23 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 59a1f2ee5..2df7dcfd3 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -341,9 +341,6 @@ def build_training_sample(sample, target_seq_length, # ) text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) - print("text_tokens_ids") - print(len(text_tokens_ids)) - print(text_tokens_ids) prefix_len = len(input_tokens_ids) diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 0326e778a..5ba98cd11 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -112,16 +112,6 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - import numpy as np - with open('attention_mask.npy', 'wb') as f: - np.save(f, attention_mask.cpu().numpy()) - with open('loss_mask.npy', 'wb') as f: - np.save(f, loss_mask.cpu().numpy()) - with open('position_ids.npy', 'wb') as f: - np.save(f, position_ids.cpu().numpy()) - import sys - sys.exit() - # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) diff --git a/train_ND_MTF_gpt.py b/train_ND_MTF_gpt.py index d16c9bb82..69b8c825b 100644 --- a/train_ND_MTF_gpt.py +++ b/train_ND_MTF_gpt.py @@ -112,16 +112,6 @@ def get_batch(data_iterator): loss_on_targets_only=args.loss_on_targets_only ) - import numpy as np - with open('attention_mask.npy', 'wb') as f: - np.save(f, attention_mask.cpu().numpy()) - with open('loss_mask.npy', 'wb') as f: - np.save(f, loss_mask.cpu().numpy()) - with open('position_ids.npy', 'wb') as f: - np.save(f, position_ids.cpu().numpy()) - import sys - sys.exit() - # weight loss_mask if args.reweight_loss_based_on_position_frequency: reweight_loss_mask_(loss_mask, tokens) From de7dfc83090537f02cd48f8982491fdbd689c599 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 23:57:41 +0700 Subject: [PATCH 126/297] minor fix --- 4B8-en-ND-MLM.sh | 2 -- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 77f91de81..3af4ed0bc 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -55,8 +55,6 @@ GPT_ARGS=" \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ - --encoder-seq-length 
$INPUT_LEN \ - --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 5ba98cd11..148d287bb 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, + max_seq_length=512, #args.encoder_seq_length, + max_seq_length_dec=114, #args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From be2af770de7c55f333851c965a62cdfc92f53087 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:01:37 +0700 Subject: [PATCH 127/297] minor fix --- 4B8-en-ND-MLM.sh | 3 ++- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 3af4ed0bc..3de57d8ad 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$INPUT_LEN #$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 @@ -55,6 +55,7 @@ GPT_ARGS=" \ --ffn-hidden-size $FFN_HIDDEN_SIZE \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ + --seq-length $SEQ_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 148d287bb..80789f9ef 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512, #args.encoder_seq_length, - max_seq_length_dec=114, #args.decoder_seq_length, + max_seq_length=args.seq_length, + max_seq_length_dec=args.seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 5e7e18f4a095467d167c4cdf3f909f7a83e94908 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:04:55 +0700 Subject: [PATCH 128/297] minor fix --- 4B8-en-ND-MLM.sh | 2 +- train_ND_MLM_gpt.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 3de57d8ad..969bba02a 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 INPUT_LEN=512 TARGET_LEN=114 -SEQ_LEN=$INPUT_LEN #$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 80789f9ef..8a254cee0 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,8 +197,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, - max_seq_length_dec=args.seq_length, + max_seq_length=512, #args.seq_length, + max_seq_length_dec=114, #args.seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 24d4f25dd647f5c8a63cca0e48011fe057d06fe5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:23:21 +0700 Subject: [PATCH 
129/297] minor fix --- 4B8-en-ND-MLM.sh | 4 +--- train_ND_MLM_gpt.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 969bba02a..53e3d9d01 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -20,9 +20,7 @@ DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatic MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 -INPUT_LEN=512 -TARGET_LEN=114 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=512 NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 8a254cee0..e286d1e0d 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, compute_input_and_target_lengths #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -192,13 +192,18 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: + + extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) + args.max_position_embeddings = extended_seq_length + args.seq_length = extended_seq_length + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=512, #args.seq_length, - max_seq_length_dec=114, #args.seq_length, + max_seq_length=extended_seq_length, + max_seq_length_dec=target_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 5926be1c0f326964b98cdb613c276e2c769be1b1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:26:58 +0700 Subject: [PATCH 130/297] minor fix --- train_ND_MLM_gpt.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index e286d1e0d..8765e05f8 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -194,15 +194,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): if args.data_path: extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) - args.max_position_embeddings = extended_seq_length - args.seq_length = extended_seq_length train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=extended_seq_length, + max_seq_length=args.seq_length, max_seq_length_dec=target_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, From 0f18174c533f407c57313793de0b3090f64ecdd1 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:34:30 +0700 Subject: [PATCH 131/297] minor fix --- 4B8-en-ND-MLM.sh | 8 ++++++-- train_ND_MLM_gpt.py | 9 +++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 53e3d9d01..58247e9e4 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -20,7 +20,9 @@ DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived 
automatic MICRO_BATCH_SIZE=1 GLOBAL_BATCH_SIZE=2048 TRAIN_ITER=39_718 -SEQ_LEN=512 +INPUT_LEN=512 +TARGET_LEN=114 +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 @@ -54,6 +56,8 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ @@ -68,7 +72,7 @@ GPT_ARGS=" \ " OUTPUT_ARGS=" \ - --log-interval 200 \ + --log-interval 1 \ --save-interval $SAVE_INTERVAL \ --eval-interval $TRAIN_ITER \ --eval-iters 1 \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 8765e05f8..5ba98cd11 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -23,7 +23,7 @@ from megatron import get_tokenizer from megatron import mpu -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets, compute_input_and_target_lengths #, build_dataset_group +from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -192,16 +192,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): # Option 1 of data loading using --data-path if args.data_path: - - extended_seq_length, target_length = compute_input_and_target_lengths(args.seq_length, args.mask_prob, 3) - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, - max_seq_length_dec=target_length, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, seed=args.seed, From 58ce71440290ee62dcb7215a32705179ba5c2fb7 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:42:02 +0700 Subject: [PATCH 132/297] minor fix --- 4B8-en-ND-MLM.sh | 1 - train_ND_MLM_gpt.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 58247e9e4..4d0be40cd 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,7 +56,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --encoder-seq-length $INPUT_LEN \ --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 5ba98cd11..a83ded877 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, + max_seq_length=args.seq_length-args.decoder_seq_length, max_seq_length_dec=args.decoder_seq_length, masked_lm_prob=args.mask_prob, short_seq_prob=args.short_seq_prob, From 05470d7c89ea223819e287586e9a019d1eb7dc7b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 3 Jun 2022 00:48:21 +0700 Subject: [PATCH 133/297] set correct seq len --- 4B8-en-ND-MLM.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 
4d0be40cd..949a84ed8 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -18,10 +18,10 @@ TP_SIZE=1 # always fixed to the size of a single node DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=2048 -TRAIN_ITER=39_718 -INPUT_LEN=512 -TARGET_LEN=114 +GLOBAL_BATCH_SIZE=512 +TRAIN_ITER=48_562 +INPUT_LEN=1675 +TARGET_LEN=373 SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 From 51a23f23a392d76b71322b7ae07696c318b53866 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:14:31 +0700 Subject: [PATCH 134/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 247 +++++++++++++----------- 1 file changed, 130 insertions(+), 117 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 2df7dcfd3..f36730c64 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -25,7 +25,7 @@ from megatron import mpu, print_rank_0, get_tokenizer from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_samples_mapping, create_masked_lm_predictions +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, create_masked_lm_predictions from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_, get_indexed_dataset_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset @@ -177,16 +177,23 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset + max_ngrams = 3 + # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. + # To ensure that the input length is `max_seq_length`, we need to increase the maximum length + # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. + expanded_inputs_length, targets_length = compute_input_and_target_lengths( + self.max_seq_length, + self.masked_lm_prob, + max_ngrams + ) + # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) + self.samples_mapping = get_samples_mapping( + self.indexed_dataset, + data_prefix, + self.name, + max_len=expanded_inputs_length + ) # Vocab stuff. tokenizer = get_tokenizer() @@ -202,88 +209,40 @@ def __init__(self, name, indexed_dataset, data_prefix, assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" def __len__(self): - return self.samples_mapping.shape[0] + return len(self.samples_mapping) def __getitem__(self, idx): - start_index, end_index, seq_length = self.samples_mapping[idx] + indices = self.samples_mapping[idx] sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - - #concat more to avoid padding - while seq_length < (self.max_seq_length/self.masked_lm_prob): - _idx = random.randint(idx, self.__len__()) - start_index, end_index, _seq_length = self.samples_mapping[_idx] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - - seq_length += _seq_length - - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. 
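compute_input_and_target_lengths itself does not appear in these hunks. For reference, the T5/HF formulation it presumably follows inverts the span-corruption bookkeeping: it finds the longest raw token run whose corrupted encoder input still fits the requested length once spans are collapsed to sentinels plus EOS. A sketch (reconstructed, not copied from this repo):

def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
    def _lengths(tokens_length):
        num_noise_tokens = int(round(tokens_length * noise_density))
        num_nonnoise_tokens = tokens_length - num_noise_tokens
        num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
        # inputs = non-noise tokens + one sentinel per span + EOS;
        # targets = noise tokens + one sentinel per span + EOS.
        return (num_nonnoise_tokens + num_noise_spans + 1,
                num_noise_tokens + num_noise_spans + 1)

    tokens_length = inputs_length
    while _lengths(tokens_length + 1)[0] <= inputs_length:
        tokens_length += 1
    _, targets_length = _lengths(tokens_length)
    return tokens_length, targets_length

print(compute_input_and_target_lengths(512, 0.15, 3))   # (568, 114)

For a 512-token encoder input at 15% noise with mean span length 3 this yields (568, 114), which lines up with the max_len=568 default of get_samples_mapping below and with the TARGET_LEN=114 used by 4B8-en-ND-MLM.sh earlier in the series.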
- np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, seq_length, - self.max_seq_length, # needed for padding - self.max_seq_length_dec, - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) - - -def build_training_sample(sample, target_seq_length, - max_seq_length, max_seq_length_dec, - vocab_id_list, vocab_id_to_token_dict, - cls_id, sep_id, mask_id, pad_id, - masked_lm_prob, np_rng, bos_id=None, - eos_id=None, sentinel_tokens=None): + for doc_idx, start_index, end_index in indices: + sample.append(self.indexed_dataset[index][start_index:end_index]) + + return build_training_sample( + sample, expanded_inputs_length, self.vocab_id_list, + self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.bos_id, self.eos_id, + self.sentinel_tokens + ) + + +def build_training_sample( + sample, expanded_inputs_length, vocab_id_list, + cls_id, sep_id, mask_id, pad_id, bos_id=None, eos_id=None, sentinel_tokens=None + ): """Build training sample. Arguments: - sample: A list of sentences in which each sentence is a list token ids. - target_seq_length: Desired sequence length. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - bos_id: start of decoder example id - eos_id: end of generation id - sentinel_tokens: unique value to be substituted for every replaced span + TODO: Add description """ - # assert target_seq_length <= max_seq_length - # flatten sentences into one list tokens = [token for sentence in sample for token in sentence] - # # Truncate to `target_sequence_length`. - # max_num_tokens = target_seq_length - # truncated = len(tokens) > max_num_tokens - # tokens = tokens[:max_num_tokens] - - max_ngrams = 3 - # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. - # To ensure that the input length is `max_seq_length`, we need to increase the maximum length - # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. - expanded_inputs_length, targets_length = compute_input_and_target_lengths( - max_seq_length, - masked_lm_prob, - max_ngrams - ) - - mask_indices = np.asarray([random_spans_noise_mask(expanded_inputs_length)]) + mask_indices = np.asarray([random_spans_noise_mask( + expanded_inputs_length, + noise_density=0.15, + mean_noise_span_length=3 + )]) labels_mask = ~mask_indices input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) @@ -303,43 +262,6 @@ def build_training_sample(sample, target_seq_length, input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] - # # Masking. 
- # max_predictions_per_seq = masked_lm_prob * max_num_tokens - # (tokens, masked_positions, masked_labels, _, masked_spans) = create_masked_lm_predictions( - # tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - # cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, - # max_ngrams=max_ngrams, geometric_dist=True, masking_style="t5") - - # sentinel_tokens = collections.deque(sentinel_tokens) - # input_tokens_ids = [] - # output_tokens_ids = [] #[bos_id] - # (start_index, end_index) = (0, None) - # for span in masked_spans: - # flag = sentinel_tokens.popleft() - - # output_tokens_ids.append(flag) - # output_tokens_ids.extend(span.label) - - # end_index = span.index[0] - # input_tokens_ids.extend(tokens[start_index: end_index]) - # input_tokens_ids.append(flag) - - # # the next start index is the token after the last span token - # start_index = span.index[-1] + 1 - - - # # Add the remaining tokens to input_tokens_ids - # input_tokens_ids.extend(tokens[start_index:]) - # input_tokens_ids.append(eos_id) - # # Add token to the output_tokens_ids - # output_tokens_ids.append(eos_id) - - # text_tokens_ids = pad_and_convert_to_numpy( - # input_tokens_ids+output_tokens_ids, - # pad_id, - # max_seq_length+max_seq_length_dec - # ) - text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) prefix_len = len(input_tokens_ids) @@ -350,6 +272,97 @@ def build_training_sample(sample, target_seq_length, } +def get_samples_mapping(indexed_dataset, data_prefix, name, max_len=568): + + def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): + + if idx_list is None: + idx_list = [] + + if idx_offset is None: + idx_offset = 0 + + if sample_len < max_len: + idx_list.append(idx_offset+sample_len) + else: + sample_len = sample_len - max_len + idx_list.append(idx_offset+max_len) + idx_offset += max_len + + breakdown(sample_len, idx_offset=idx_offset, idx_list=idx_list) + + idx_list = [0]+idx_list + return list(zip(idx_list[:-1], idx_list[1:])) + + + # Filename of the index mapping + indexmap_filename = data_prefix + indexmap_filename += '_{}_indexmap'.format(name) + indexmap_filename += '.npy' + + # Build the indexed mapping if not exist. 
+ if torch.distributed.get_rank() == 0 and \ + not os.path.isfile(indexmap_filename): + + samples_mapping = [] + sample_indices = [] + doc_idx = 0 + current_len = 0 + _idx = 0 + for doc_idx, sample_len in zip(indexed_dataset.doc_idx, indexed_dataset.sizes): + _idx = 0 + + if current_len + sample_len > max_len: + end_idx = max_len - current_len + sample_indices.append([doc_idx, 0, end_idx]) + samples_mapping.append(sample_indices) + sample_indices = [] + current_len = 0 + sample_len -= end_idx + _idx = end_idx + + break_len = current_len + sample_len + + indices = breakdown(sample_len) + for _start_idx, _end_idx in indices: + _len = _end_idx - _start_idx + if _len == max_len: + samples_mapping.append([[doc_idx, _start_idx+_idx, _end_idx+_idx]]) + else: + sample_indices.append([doc_idx, _start_idx+_idx, _end_idx+_idx]) + current_len += _len + + print_rank_0(' > done building samples index mapping') + np.save(indexmap_filename, samples_mapping, allow_pickle=True) + print_rank_0(' > saved the index mapping in {}'.format( + indexmap_filename)) + # Make sure all the ranks have built the mapping + print_rank_0(' > elapsed time to build and save samples mapping ' + '(seconds): {:4f}'.format( + time.time() - start_time)) + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load indexed dataset. + print_rank_0(' > loading indexed mapping from {}'.format( + indexmap_filename)) + start_time = time.time() + samples_mapping = np.load(indexmap_filename, allow_pickle=True) + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + print_rank_0(' total number of samples: {}'.format( + len(samples_mapping))) + + return samples_mapping + + def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): """Pad sequences and convert them to numpy.""" From 43cb2f046fc74bcbc3b3454b2ee81df039595451 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:18:25 +0700 Subject: [PATCH 135/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index f36730c64..e37a1981c 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -303,7 +303,18 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): # Build the indexed mapping if not exist. if torch.distributed.get_rank() == 0 and \ not os.path.isfile(indexmap_filename): - + print(' > WARNING: could not find index map file {}, building ' + 'the indices on rank 0 ...'.format(indexmap_filename)) + + # Make sure the types match the helpers input types. 
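Putting breakdown and the packing loop together, here is a dry run of the same logic over three hypothetical document sizes, with max_len shrunk to 10 for readability (the real loop walks indexed_dataset.doc_idx/sizes; enumerate stands in for that here). Note that the final, partially filled group appears to be dropped rather than emitted:

def breakdown(sample_len, idx_offset=0, idx_list=None, max_len=10):
    if idx_list is None:
        idx_list = []
    if sample_len < max_len:
        idx_list.append(idx_offset + sample_len)
    else:
        idx_list.append(idx_offset + max_len)
        breakdown(sample_len - max_len, idx_offset + max_len, idx_list, max_len)
    idx_list = [0] + idx_list
    return list(zip(idx_list[:-1], idx_list[1:]))

max_len, sizes = 10, [4, 9, 23]                  # hypothetical document sizes
samples_mapping, sample_indices, current_len = [], [], 0
for doc_idx, sample_len in enumerate(sizes):
    _idx = 0
    if current_len + sample_len > max_len:
        # The head of this document tops up the currently open group ...
        end_idx = max_len - current_len
        sample_indices.append([doc_idx, 0, end_idx])
        samples_mapping.append(sample_indices)
        sample_indices, current_len = [], 0
        sample_len -= end_idx
        _idx = end_idx
    # ... and the rest is chopped into max_len-sized windows.
    for start, end in breakdown(sample_len, max_len=max_len):
        if end - start == max_len:
            samples_mapping.append([[doc_idx, start + _idx, end + _idx]])
        else:
            sample_indices.append([doc_idx, start + _idx, end + _idx])
            current_len += end - start

print(samples_mapping)
# [[[0, 0, 4], [1, 0, 6]], [[1, 6, 9], [2, 0, 7]], [[2, 7, 17]]]
# the leftover slice [2, 17, 23] never reaches samples_mapping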
+ assert indexed_dataset.doc_idx.dtype == np.int64 + assert indexed_dataset.sizes.dtype == np.int32 + + # Build samples mapping + verbose = torch.distributed.get_rank() == 0 + start_time = time.time() + print_rank_0(' > building samples index mapping for {} ...'.format( + name)) From 901defc8e325228e329f34a11c6d099b40a094b8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 19:19:34 +0700 Subject: [PATCH 136/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index e37a1981c..cb2ee5b34 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -216,7 +216,7 @@ def __getitem__(self, idx): indices = self.samples_mapping[idx] sample = [] for doc_idx, start_index, end_index in indices: - sample.append(self.indexed_dataset[index][start_index:end_index]) + sample.append(self.indexed_dataset[doc_idx][start_index:end_index]) From 3130d7d1bc86427144ff4bf29addc44149dad1bc Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:28:31 +0700 Subject: [PATCH 137/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index cb2ee5b34..7c8ca01a0 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -216,7 +216,7 @@ def __getitem__(self, idx): indices = self.samples_mapping[idx] sample = [] for doc_idx, start_index, end_index in indices: - sample.append(self.indexed_dataset[doc_idx][start_index:end_index]) + sample.append(self.indexed_dataset.get(doc_idx)[start_index:end_index]) From 18eb53d7171aabcaecfc52bdd8c26a4fc85c8337 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:30:30 +0700 Subject: [PATCH 138/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 7c8ca01a0..0c6213fa8 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -186,6 +186,8 @@ def __init__(self, name, indexed_dataset, data_prefix, self.masked_lm_prob, max_ngrams ) + self.expanded_inputs_length = expanded_inputs_length + self.targets_length = targets_length # Build the samples mapping. 
self.samples_mapping = get_samples_mapping( @@ -219,7 +221,7 @@ def __getitem__(self, idx): sample.append(self.indexed_dataset.get(doc_idx)[start_index:end_index]) return build_training_sample( - sample, expanded_inputs_length, self.vocab_id_list, + sample, self.expanded_inputs_length, self.vocab_id_list, self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.bos_id, self.eos_id, self.sentinel_tokens ) From 652c545cec86aedc617c8147be70af4bdd130c67 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 21:36:21 +0700 Subject: [PATCH 139/297] refined sampling method --- megatron/data/non_causal_mlm_dataset.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 0c6213fa8..9289f6b51 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -250,16 +250,6 @@ def build_training_sample( input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - - if len(tokens) <= expanded_inputs_length: - tokens = pad_and_convert_to_numpy( - tokens, - pad_id, - expanded_inputs_length - ) - else: - tokens = tokens[:expanded_inputs_length] - tokens = np.asarray([tokens]) input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] @@ -385,7 +375,7 @@ def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): assert padding_length >= 0 # Tokens and token types. - filler = np.array([pad_id] * padding_length) + filler = np.array([pad_id] * padding_length, dtype=np.int64) tokens_np = np.concatenate((tokens, filler), dtype=np.int64) return tokens_np From 5a49db8ed8f2d27869fd6ae34a54f7aaeb75a41d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:08:16 +0700 Subject: [PATCH 140/297] first commit, adding non causal mlm dataset --- 4B8-en-ND-MLM.sh | 3 +- megatron/data/non_causal_mlm_dataset.py | 50 ++++++------------------- train_ND_MLM_gpt.py | 6 +-- 3 files changed, 15 insertions(+), 44 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 949a84ed8..a856f2e77 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=512 TRAIN_ITER=48_562 INPUT_LEN=1675 TARGET_LEN=373 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) +SEQ_LEN=$INPUT_LEN NLAYERS=24 NHIDDEN=4096 @@ -56,7 +56,6 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --decoder-seq-length $TARGET_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index 9289f6b51..bab07ceba 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -33,20 +33,16 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head=False, - max_seq_length_dec=None, - dataset_type='standard_bert'): + masked_lm_prob, seed, + skip_warmup + ): if len(data_prefix) == 1: return _build_train_valid_test_datasets(data_prefix[0], data_impl, splits_string, train_valid_test_num_samples, max_seq_length, masked_lm_prob, - short_seq_prob, seed, - skip_warmup, - 
binary_head, - max_seq_length_dec, - dataset_type=dataset_type) + seed, skip_warmup + ) # Blending dataset. # Parse the values. output = get_datasets_weights_and_num_samples(data_prefix, @@ -61,8 +57,8 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( prefixes[i], data_impl, splits_string, datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, short_seq_prob, - seed, skip_warmup, binary_head, dataset_type=dataset_type) + max_seq_length, masked_lm_prob, + seed, skip_warmup) if train_ds: train_datasets.append(train_ds) if valid_ds: @@ -87,11 +83,8 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, short_seq_prob, seed, - skip_warmup, binary_head, - max_seq_length_dec, - dataset_type='standard_bert'): + max_seq_length, masked_lm_prob, seed, + skip_warmup): """Build train, valid, and test datasets.""" @@ -134,16 +127,12 @@ def build_dataset(index, name): kwargs = dict( name=name, data_prefix=data_prefix, - num_epochs=None, - max_num_samples=train_valid_test_num_samples[index], max_seq_length=max_seq_length, seed=seed, ) dataset = NonCausalMLMDataset( indexed_dataset=indexed_dataset, masked_lm_prob=masked_lm_prob, - max_seq_length_dec=max_seq_length_dec, - short_seq_prob=short_seq_prob, **kwargs ) indexed_dataset.set_doc_idx(doc_idx_ptr) @@ -163,9 +152,9 @@ def build_dataset(index, name): class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, max_seq_length_dec, - short_seq_prob, seed): + masked_lm_prob, + max_seq_length, + seed): # Params to store. self.name = name @@ -366,21 +355,6 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): return samples_mapping -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - - # Tokens and token types. - filler = np.array([pad_id] * padding_length, dtype=np.int64) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - - return tokens_np - - def create_sentinel_ids(mask_indices, vocab_len): """ Sentinel ids creation given the indices that should be masked. 
diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index a83ded877..b942cd8bc 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,13 +197,11 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length-args.decoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, + max_seq_length=args.seq_length, masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), - dataset_type='t5') + ) # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths # elif args.train_weighted_split_paths: From 81b918c9c6c6f6e02f196303295b805f0b964010 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:10:10 +0700 Subject: [PATCH 141/297] fixed mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index bab07ceba..df66ef9e7 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -161,7 +161,6 @@ def __init__(self, name, indexed_dataset, data_prefix, self.seed = seed self.masked_lm_prob = masked_lm_prob self.max_seq_length = max_seq_length - self.max_seq_length_dec = max_seq_length_dec # Dataset. self.indexed_dataset = indexed_dataset From 95afc4f04e36644f8ba800f0a15f9119c0e0c3ae Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:16:21 +0700 Subject: [PATCH 142/297] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 2 +- train_ND_MLM_gpt.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index a856f2e77..4b25609a7 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -22,7 +22,7 @@ GLOBAL_BATCH_SIZE=512 TRAIN_ITER=48_562 INPUT_LEN=1675 TARGET_LEN=373 -SEQ_LEN=$INPUT_LEN +SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) NLAYERS=24 NHIDDEN=4096 diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index b942cd8bc..c6a729acd 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length, + max_seq_length=args.seq_length-373, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From c4514d8edf12401a20827f2d8bff5a25c85fb9e3 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:20:02 +0700 Subject: [PATCH 143/297] fixed mlm dataset --- megatron/data/non_causal_mlm_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index df66ef9e7..ceb99e7ce 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -252,9 +252,9 @@ def build_training_sample( } -def get_samples_mapping(indexed_dataset, data_prefix, name, max_len=568): +def get_samples_mapping(indexed_dataset, data_prefix, name, max_len): - def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): + def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=None): if idx_list is None: idx_list = [] @@ -314,7 +314,7 @@ def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=max_len): break_len = current_len + 
sample_len - indices = breakdown(sample_len) + indices = breakdown(sample_len, max_len=max_len) for _start_idx, _end_idx in indices: _len = _end_idx - _start_idx if _len == max_len: From 5cca5af4a86ebac6bee39a1e25e27947325f53fd Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:30:33 +0700 Subject: [PATCH 144/297] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 1 + train_ND_MLM_gpt.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index 4b25609a7..a765b1b4a 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,6 +56,7 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ + --encoder-seq-length $INPUT_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index c6a729acd..881b02c45 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.seq_length-373, + max_seq_length=args.encoder_seq_length, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From ae958788de89616500eb85ae6d8add5547e46d9e Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 8 Jun 2022 22:48:30 +0700 Subject: [PATCH 145/297] fixed mlm dataset --- 4B8-en-ND-MLM.sh | 2 +- megatron/arguments.py | 2 ++ train_ND_MLM_gpt.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh index a765b1b4a..75fc3e89d 100644 --- a/4B8-en-ND-MLM.sh +++ b/4B8-en-ND-MLM.sh @@ -56,7 +56,7 @@ GPT_ARGS=" \ --max-position-embeddings $SEQ_LEN \ --position-embedding-type alibi \ --seq-length $SEQ_LEN \ - --encoder-seq-length $INPUT_LEN \ + --input-length $INPUT_LEN \ --micro-batch-size $MICRO_BATCH_SIZE \ --global-batch-size $GLOBAL_BATCH_SIZE \ --train-iters $TRAIN_ITER \ diff --git a/megatron/arguments.py b/megatron/arguments.py index 230bd4d65..aa84ec285 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -880,6 +880,8 @@ def __call__(self, parser, args, values, option_string=None): 'They are used for span masking in the T5 model') group.add_argument('--seq-length', type=int, default=None, help='Maximum sequence length to process.') + group.add_argument('--input-length', type=int, default=None, + help='Maximum sequence length to process.') group.add_argument('--encoder-seq-length', type=int, default=None, help='Maximum encoder sequence length to process.' 
'This should be exclusive of --seq-length') diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py index 881b02c45..3f23320e8 100644 --- a/train_ND_MLM_gpt.py +++ b/train_ND_MLM_gpt.py @@ -197,7 +197,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, + max_seq_length=args.input_length, masked_lm_prob=args.mask_prob, seed=args.seed, skip_warmup=(not args.mmap_warmup), From a03e59f36c479cfebb5ef4da2d1423c90bba0b0c Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Tue, 14 Jun 2022 20:36:09 +0700 Subject: [PATCH 146/297] minor changes --- megatron/arguments.py | 2 +- megatron/data/non_causal_mlm_dataset.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index aa84ec285..def726d98 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -881,7 +881,7 @@ def __call__(self, parser, args, values, option_string=None): group.add_argument('--seq-length', type=int, default=None, help='Maximum sequence length to process.') group.add_argument('--input-length', type=int, default=None, - help='Maximum sequence length to process.') + help='Maximum input length to process for MLM adaptation.') group.add_argument('--encoder-seq-length', type=int, default=None, help='Maximum encoder sequence length to process.' 'This should be exclusive of --seq-length') diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py index ceb99e7ce..aa4a45a9f 100644 --- a/megatron/data/non_causal_mlm_dataset.py +++ b/megatron/data/non_causal_mlm_dataset.py @@ -154,7 +154,8 @@ class NonCausalMLMDataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, masked_lm_prob, max_seq_length, - seed): + seed, + max_ngrams = 3): # Params to store. self.name = name @@ -165,14 +166,14 @@ def __init__(self, name, indexed_dataset, data_prefix, # Dataset. self.indexed_dataset = indexed_dataset - max_ngrams = 3 + self.max_ngrams = max_ngrams # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. # To ensure that the input length is `max_seq_length`, we need to increase the maximum length # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. 
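# Editor's note (illustrative, not part of the patch): with the settings from
# 4B8-en-ND-MLM.sh -- max_seq_length=1675, masked_lm_prob=0.15, max_ngrams=3 --
# compute_input_and_target_lengths() walks the raw span length up until the
# corrupted encoding just fits:
#   tokens_length (raw tokens fetched)       = 1860
#   num_noise_tokens = round(1860 * 0.15)    = 279
#   num_noise_spans  = round(279 / 3)        = 93
#   encoder input    = (1860 - 279) + 93 + 1 = 1675
#   targets_length   = 279 + 93 + 1          = 373
# which is exactly the INPUT_LEN=1675 / TARGET_LEN=373 pair in the launch
# script, so expanded_inputs_length below comes out as 1860.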
expanded_inputs_length, targets_length = compute_input_and_target_lengths( self.max_seq_length, self.masked_lm_prob, - max_ngrams + self.max_ngrams ) self.expanded_inputs_length = expanded_inputs_length self.targets_length = targets_length From fa1e072d042fa16d64216bb0821ac2e2524f2e4b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 22 Jun 2022 17:41:16 +0700 Subject: [PATCH 147/297] removed mlm related scripts --- 4B8-en-CD-FLM.sh | 157 -------- 4B8-en-ND-MLM.sh | 156 -------- megatron/data/non_causal_mlm_dataset.py | 499 ------------------------ train_ND_MLM_gpt.py | 267 ------------- 4 files changed, 1079 deletions(-) delete mode 100644 4B8-en-CD-FLM.sh delete mode 100644 4B8-en-ND-MLM.sh delete mode 100644 megatron/data/non_causal_mlm_dataset.py delete mode 100644 train_ND_MLM_gpt.py diff --git a/4B8-en-CD-FLM.sh b/4B8-en-CD-FLM.sh deleted file mode 100644 index 17079579d..000000000 --- a/4B8-en-CD-FLM.sh +++ /dev/null @@ -1,157 +0,0 @@ -#!/bin/bash - -EXPERIMENT_NAME=4B8-en-CD-FLM -REPO_PATH=experiments/$EXPERIMENT_NAME -CHECKPOINT_PATH=$REPO_PATH/checkpoints -TENSORBOARD_PATH=$REPO_PATH/tensorboard -CODECARBON_PATH=$REPO_PATH/codecarbon -LOGS_PATH=$REPO_PATH/logs - -DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document - -# XXX: edit me -GPUS_PER_NODE=8 -NNODES=1 -PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here -TP_SIZE=1 # always fixed to the size of a single node -DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer - -MICRO_BATCH_SIZE=32 -GLOBAL_BATCH_SIZE=2048 -TRAIN_ITER=131_072 -SEQ_LEN=626 - - -NLAYERS=24 -NHIDDEN=4096 -NHEADS=64 -FFN_HIDDEN_SIZE=10240 -MAX_POSITION_EMBEDDING=1280 - -SAVE_INTERVAL=1500 - -OPTIMIZER_ARGS=" \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.999 \ - --adam-eps 1e-8 \ - --lr 2e-4 \ - --min-lr 1e-5 \ - --lr-decay-style cosine \ - --clip-grad 1.0 \ - --weight-decay 1e-1 \ - " - -EXIT_OPTS=" \ - --exit-duration-in-mins 1190 \ - " - -GPT_ARGS=" \ - --num-layers $NLAYERS \ - --hidden-size $NHIDDEN \ - --num-attention-heads $NHEADS \ - --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $SEQ_LEN \ - --position-embedding-type alibi \ - --seq-length $SEQ_LEN \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --global-batch-size $GLOBAL_BATCH_SIZE \ - --train-iters $TRAIN_ITER \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path bigscience/tokenizer \ - --loss-scale 12 \ - --clip-grad 1.0 \ - --fp16 \ - --checkpoint-activations \ - $OPTIMIZER_ARGS \ - $EXIT_OPTS \ - " - -OUTPUT_ARGS=" \ - --log-interval 1 \ - --save-interval $SAVE_INTERVAL \ - --eval-interval $TRAIN_ITER \ - --eval-iters 1 \ - --tensorboard-dir $TENSORBOARD_PATH \ - --tensorboard-queue-size 5 \ - --log-timers-to-tensorboard \ - --log-batch-size-to-tensorboard \ - --log-validation-ppl-to-tensorboard \ - " - -ZERO_STAGE=1 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " 
- -# export LAUNCHER="python -u -m torch.distributed.launch \ -# --nproc_per_node $GPUS_PER_NODE \ -# " -# # --nnodes $NNODES \ -# # --master_addr $MASTER_ADDR \ -# # --master_port $MASTER_PORT \ - -export CMD=" \ - `pwd`/pretrain_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - $GPT_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - $DEEPSPEED_ARGS \ - " - - -# # clear old checkpoint as it'd mismatch while we sort things out -# rm -rf $SAVE_CHECKPOINT_PATH - - -echo $CMD - -# We create the folder where the logs and codecarbon will be stored. -mkdir -p $REPO_PATH -mkdir -p $LOGS_PATH -# to debug - add echo (it exits and prints what it would have launched) - -# python -u -m torch.distributed.launch \ -# --nproc_per_node $GPUS_PER_NODE \ -# $CMD - -deepspeed --num_gpus $GPUS_PER_NODE \ - $CMD - -# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh deleted file mode 100644 index 75fc3e89d..000000000 --- a/4B8-en-ND-MLM.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash - -EXPERIMENT_NAME=4B8-en-ND-MLM -REPO_PATH=experiments/$EXPERIMENT_NAME -CHECKPOINT_PATH=$REPO_PATH/checkpoints -TENSORBOARD_PATH=$REPO_PATH/tensorboard -CODECARBON_PATH=$REPO_PATH/codecarbon -LOGS_PATH=$REPO_PATH/logs - -DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document -TOKENIZER_PATH=bigscience-tokenizer-padded - -# XXX: edit me -GPUS_PER_NODE=8 -NNODES=1 -PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here -TP_SIZE=1 # always fixed to the size of a single node -DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=512 -TRAIN_ITER=48_562 -INPUT_LEN=1675 -TARGET_LEN=373 -SEQ_LEN=$((INPUT_LEN+TARGET_LEN)) - -NLAYERS=24 -NHIDDEN=4096 -NHEADS=64 -FFN_HIDDEN_SIZE=10240 - - -SAVE_INTERVAL=1500 - -OPTIMIZER_ARGS=" \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.999 \ - --adam-eps 1e-8 \ - --lr 2e-4 \ - --min-lr 1e-5 \ - --lr-decay-style cosine \ - --clip-grad 1.0 \ - --weight-decay 1e-1 \ - " - -EXIT_OPTS=" \ - --exit-duration-in-mins 1190 \ - " - -GPT_ARGS=" \ - --num-layers $NLAYERS \ - --hidden-size $NHIDDEN \ - --num-attention-heads $NHEADS \ - --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $SEQ_LEN \ - --position-embedding-type alibi \ - --seq-length $SEQ_LEN \ - --input-length $INPUT_LEN \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --global-batch-size $GLOBAL_BATCH_SIZE \ - --train-iters $TRAIN_ITER \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --loss-scale 12 \ - --clip-grad 1.0 \ - --fp16 \ - --checkpoint-activations \ - $OPTIMIZER_ARGS \ - $EXIT_OPTS \ - " - -OUTPUT_ARGS=" \ - --log-interval 1 \ - --save-interval $SAVE_INTERVAL \ - --eval-interval $TRAIN_ITER \ - --eval-iters 1 \ - --tensorboard-dir $TENSORBOARD_PATH \ - --tensorboard-queue-size 5 \ - --log-timers-to-tensorboard \ - --log-batch-size-to-tensorboard \ - --log-validation-ppl-to-tensorboard \ - " - -ZERO_STAGE=1 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 
$ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " - -# export LAUNCHER="python -u -m torch.distributed.launch \ -# --nproc_per_node $GPUS_PER_NODE \ -# " -# # --nnodes $NNODES \ -# # --master_addr $MASTER_ADDR \ -# # --master_port $MASTER_PORT \ - -export CMD=" \ - `pwd`/train_ND_MLM_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - $GPT_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - $DEEPSPEED_ARGS \ - " - - -# # clear old checkpoint as it'd mismatch while we sort things out -# rm -rf $SAVE_CHECKPOINT_PATH - - -echo $CMD - -# We create the folder where the logs and codecarbon will be stored. -mkdir -p $REPO_PATH -mkdir -p $LOGS_PATH -# to debug - add echo (it exits and prints what it would have launched) - -deepspeed --num_gpus $GPUS_PER_NODE \ - $CMD - -# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py deleted file mode 100644 index aa4a45a9f..000000000 --- a/megatron/data/non_causal_mlm_dataset.py +++ /dev/null @@ -1,499 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""GPT Non-Causal Mask Language Model Finetune Style dataset.""" - -import os -import time -import random -import collections - -import numpy as np -import torch - -from megatron import mpu, print_rank_0, get_tokenizer -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, create_masked_lm_predictions -from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_, get_indexed_dataset_ -from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - - -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - max_seq_length, - masked_lm_prob, seed, - skip_warmup - ): - if len(data_prefix) == 1: - return _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - max_seq_length, masked_lm_prob, - seed, skip_warmup - ) - # Blending dataset. - # Parse the values. - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. 
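# (Editor's note, illustrative: --data-path may list alternating weights and
# prefixes, e.g. "0.7 corpusA 0.3 corpusB". The helper above then returns
# prefixes=['corpusA', 'corpusB'], weights=[0.7, 0.3], and per-dataset sample
# counts, and the per-split datasets built below are blended with those
# weights.)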
- train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - max_seq_length, masked_lm_prob, - seed, skip_warmup) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - # Blend. - blending_train_dataset = None - if train_datasets: - blending_train_dataset = BlendableDataset(train_datasets, weights) - blending_valid_dataset = None - if valid_datasets: - blending_valid_dataset = BlendableDataset(valid_datasets, weights) - blending_test_dataset = None - if test_datasets: - blending_test_dataset = BlendableDataset(test_datasets, weights) - - return (blending_train_dataset, blending_valid_dataset, - blending_test_dataset) - - -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - max_seq_length, masked_lm_prob, seed, - skip_warmup): - """Build train, valid, and test datasets.""" - - - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) - - total_num_of_documents = indexed_dataset.sizes.shape[0] - 1 - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - # Print stats about the splits. - print_rank_0(' > dataset split:') - - def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - start_index = indexed_dataset.doc_idx[splits[index]] - end_index = indexed_dataset.doc_idx[splits[index + 1]] - print_rank_0(' sentence indices in [{}, {}) total of {} ' - 'sentences'.format(start_index, end_index, - end_index - start_index)) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - # Get the pointer to the original doc-idx so we can set it later. - doc_idx_ptr = indexed_dataset.get_doc_idx() - # Slice the doc-idx - start_index = splits[index] - # Add +1 so we can index into the dataset to get the upper bound. - end_index = splits[index + 1] + 1 - # New doc_idx view. - indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index]) - # Build the dataset accordingly. - kwargs = dict( - name=name, - data_prefix=data_prefix, - max_seq_length=max_seq_length, - seed=seed, - ) - dataset = NonCausalMLMDataset( - indexed_dataset=indexed_dataset, - masked_lm_prob=masked_lm_prob, - **kwargs - ) - indexed_dataset.set_doc_idx(doc_idx_ptr) - # Checks. - # assert indexed_dataset.doc_idx[0] == 0 - # assert indexed_dataset.doc_idx.shape[0] == \ - # (total_num_of_documents + 1) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - -class NonCausalMLMDataset(torch.utils.data.Dataset): - - def __init__(self, name, indexed_dataset, data_prefix, - masked_lm_prob, - max_seq_length, - seed, - max_ngrams = 3): - - # Params to store. - self.name = name - self.seed = seed - self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length - - # Dataset. 
- self.indexed_dataset = indexed_dataset - - self.max_ngrams = max_ngrams - # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. - # To ensure that the input length is `max_seq_length`, we need to increase the maximum length - # according to `masked_lm_prob` and `max_ngrams`. We can also define the label length accordingly. - expanded_inputs_length, targets_length = compute_input_and_target_lengths( - self.max_seq_length, - self.masked_lm_prob, - self.max_ngrams - ) - self.expanded_inputs_length = expanded_inputs_length - self.targets_length = targets_length - - # Build the samples mapping. - self.samples_mapping = get_samples_mapping( - self.indexed_dataset, - data_prefix, - self.name, - max_len=expanded_inputs_length - ) - - # Vocab stuff. - tokenizer = get_tokenizer() - self.vocab_id_list = list(tokenizer.inv_vocab.keys()) - self.vocab_id_to_token_dict = tokenizer.inv_vocab - self.cls_id = tokenizer.cls - self.sep_id = tokenizer.sep - self.mask_id = tokenizer.mask - self.pad_id = tokenizer.pad - self.bos_id = tokenizer.bos_token_id - self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens_ids - assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" - - def __len__(self): - return len(self.samples_mapping) - - def __getitem__(self, idx): - - indices = self.samples_mapping[idx] - sample = [] - for doc_idx, start_index, end_index in indices: - sample.append(self.indexed_dataset.get(doc_idx)[start_index:end_index]) - - return build_training_sample( - sample, self.expanded_inputs_length, self.vocab_id_list, - self.cls_id, self.sep_id, self.mask_id, self.pad_id, self.bos_id, self.eos_id, - self.sentinel_tokens - ) - - -def build_training_sample( - sample, expanded_inputs_length, vocab_id_list, - cls_id, sep_id, mask_id, pad_id, bos_id=None, eos_id=None, sentinel_tokens=None - ): - """Build training sample. 
- - Arguments: - TODO: Add description - """ - - # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] - - mask_indices = np.asarray([random_spans_noise_mask( - expanded_inputs_length, - noise_density=0.15, - mean_noise_span_length=3 - )]) - labels_mask = ~mask_indices - - input_ids_sentinel = create_sentinel_ids(mask_indices.astype(np.int8), vocab_len=len(vocab_id_list)) - labels_sentinel = create_sentinel_ids(labels_mask.astype(np.int8), vocab_len=len(vocab_id_list)) - - tokens = np.asarray([tokens]) - input_tokens_ids = filter_input_ids(tokens, input_ids_sentinel, eos_id)[0] - output_tokens_ids = filter_input_ids(tokens, labels_sentinel, eos_id)[0] - - text_tokens_ids = np.concatenate((input_tokens_ids, output_tokens_ids)) - - prefix_len = len(input_tokens_ids) - - return { - 'text': text_tokens_ids, - 'prefix_len': prefix_len - } - - -def get_samples_mapping(indexed_dataset, data_prefix, name, max_len): - - def breakdown(sample_len, idx_offset=None, idx_list=None, max_len=None): - - if idx_list is None: - idx_list = [] - - if idx_offset is None: - idx_offset = 0 - - if sample_len < max_len: - idx_list.append(idx_offset+sample_len) - else: - sample_len = sample_len - max_len - idx_list.append(idx_offset+max_len) - idx_offset += max_len - - breakdown(sample_len, idx_offset=idx_offset, idx_list=idx_list) - - idx_list = [0]+idx_list - return list(zip(idx_list[:-1], idx_list[1:])) - - - # Filename of the index mapping - indexmap_filename = data_prefix - indexmap_filename += '_{}_indexmap'.format(name) - indexmap_filename += '.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0 and \ - not os.path.isfile(indexmap_filename): - print(' > WARNING: could not find index map file {}, building ' - 'the indices on rank 0 ...'.format(indexmap_filename)) - - # Make sure the types match the helpers input types. 
- assert indexed_dataset.doc_idx.dtype == np.int64
- assert indexed_dataset.sizes.dtype == np.int32
-
- # Build samples mapping
- verbose = torch.distributed.get_rank() == 0
- start_time = time.time()
- print_rank_0(' > building samples index mapping for {} ...'.format(
- name))
- samples_mapping = []
- sample_indices = []
- doc_idx = 0
- current_len = 0
- _idx = 0
- for doc_idx, sample_len in zip(indexed_dataset.doc_idx, indexed_dataset.sizes):
- _idx = 0
-
- if current_len + sample_len > max_len:
- end_idx = max_len - current_len
- sample_indices.append([doc_idx, 0, end_idx])
- samples_mapping.append(sample_indices)
- sample_indices = []
- current_len = 0
- sample_len -= end_idx
- _idx = end_idx
-
- break_len = current_len + sample_len
-
- indices = breakdown(sample_len, max_len=max_len)
- for _start_idx, _end_idx in indices:
- _len = _end_idx - _start_idx
- if _len == max_len:
- samples_mapping.append([[doc_idx, _start_idx+_idx, _end_idx+_idx]])
- else:
- sample_indices.append([doc_idx, _start_idx+_idx, _end_idx+_idx])
- current_len += _len
-
- print_rank_0(' > done building samples index mapping')
- np.save(indexmap_filename, samples_mapping, allow_pickle=True)
- print_rank_0(' > saved the index mapping in {}'.format(
- indexmap_filename))
- # Make sure all the ranks have built the mapping
- print_rank_0(' > elapsed time to build and save samples mapping '
- '(seconds): {:.4f}'.format(
- time.time() - start_time))
- # This should be a barrier but nccl barrier assumes
- # device_index=rank which is not the case for model
- # parallel case
- counts = torch.cuda.LongTensor([1])
- torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
- torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
- assert counts[0].item() == (
- torch.distributed.get_world_size() //
- torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))
-
- # Load indexed dataset.
- print_rank_0(' > loading indexed mapping from {}'.format(
- indexmap_filename))
- start_time = time.time()
- samples_mapping = np.load(indexmap_filename, allow_pickle=True)
- print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
- time.time() - start_time))
- print_rank_0(' total number of samples: {}'.format(
- len(samples_mapping)))
-
- return samples_mapping
-
-
-def create_sentinel_ids(mask_indices, vocab_len):
- """
- Sentinel ids creation given the indices that should be masked.
- The start indices of each mask are replaced by the sentinel ids in increasing
- order. Consecutive mask indices to be deleted are replaced with `-1`.
- """
- start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
- start_indices[:, 0] = mask_indices[:, 0]
-
- sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
- sentinel_ids = np.where(sentinel_ids != 0, (vocab_len - sentinel_ids), 0)
- sentinel_ids -= mask_indices - start_indices
-
- return sentinel_ids
-
-
-def filter_input_ids(input_ids, sentinel_ids, eos_id):
- """
- Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
- This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
- """ - batch_size = input_ids.shape[0] - - input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids) - # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are - # masked tokens coming after sentinel tokens and should be removed - input_ids = input_ids_full[input_ids_full >= 0].reshape((batch_size, -1)) - input_ids = np.concatenate( - [input_ids, np.full((batch_size, 1), eos_id, dtype=np.int32)], axis=-1 - ) - return input_ids - - -def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length): - """This function is copy of `random_spans_helper `__ . - Training parameters to avoid padding with random_spans_noise_mask. - When training a model with random_spans_noise_mask, we would like to set the other - training hyperparmeters in a way that avoids padding. - This function helps us compute these hyperparameters. - We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens, - and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens. - This function tells us the required number of tokens in the raw example (for split_tokens()) - as well as the length of the encoded targets. Note that this function assumes - the inputs and targets will have EOS appended and includes that in the reported length. - Args: - inputs_length: an integer - desired length of the tokenized inputs sequence - noise_density: a float - mean_noise_span_length: a float - Returns: - tokens_length: length of original text in tokens - targets_length: an integer - length in tokens of encoded targets sequence - """ - - def _tokens_length_to_inputs_length_targets_length(tokens_length): - num_noise_tokens = int(round(tokens_length * noise_density)) - num_nonnoise_tokens = tokens_length - num_noise_tokens - num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length)) - # inputs contain all nonnoise tokens, sentinels for all noise spans - # and one EOS token. - _input_length = num_nonnoise_tokens + num_noise_spans + 1 - _output_length = num_noise_tokens + num_noise_spans + 1 - return _input_length, _output_length - - tokens_length = inputs_length - - while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length: - tokens_length += 1 - - inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length) - - # minor hack to get the targets length to be equal to inputs length - # which is more likely to have been set to a nice round number. - if noise_density == 0.5 and targets_length > inputs_length: - tokens_length -= 1 - targets_length -= 1 - return tokens_length, targets_length - - -def random_spans_noise_mask( - length, - noise_density=0.15, - mean_noise_span_length=3 - ): - - """This function is copy of `random_spans_helper `__ . - Noise mask consisting of random spans of noise tokens. - The number of noise tokens and the number of noise spans and non-noise spans - are determined deterministically as follows: - num_noise_tokens = round(length * noise_density) - num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) - Spans alternate between non-noise and noise, beginning with non-noise. - Subject to the above restrictions, all masks are equally likely. 
- Args: - length: an int32 scalar (length of the incoming token sequence) - noise_density: a float - approximate density of output mask - mean_noise_span_length: a number - Returns: - a boolean tensor with shape [length] - """ - - orig_length = length - - num_noise_tokens = int(np.round(length * noise_density)) - # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. - num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) - num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length)) - - # avoid degeneracy by ensuring positive number of noise spans - num_noise_spans = max(num_noise_spans, 1) - num_nonnoise_tokens = length - num_noise_tokens - - # pick the lengths of the noise spans and the non-noise spans - def _random_segmentation(num_items, num_segments): - """Partition a sequence of items randomly into non-empty segments. - Args: - num_items: an integer scalar > 0 - num_segments: an integer scalar in [1, num_items] - Returns: - a Tensor with shape [num_segments] containing positive integers that add - up to num_items - """ - mask_indices = np.arange(num_items - 1) < (num_segments - 1) - np.random.shuffle(mask_indices) - first_in_segment = np.pad(mask_indices, [[1, 0]]) - segment_id = np.cumsum(first_in_segment) - # count length of sub segments assuming that list is sorted - _, segment_length = np.unique(segment_id, return_counts=True) - return segment_length - - noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) - nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans) - - interleaved_span_lengths = np.reshape( - np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2] - ) - span_starts = np.cumsum(interleaved_span_lengths)[:-1] - span_start_indicator = np.zeros((length,), dtype=np.int8) - span_start_indicator[span_starts] = True - span_num = np.cumsum(span_start_indicator) - is_noise = np.equal(span_num % 2, 1) - - return is_noise[:orig_length] \ No newline at end of file diff --git a/train_ND_MLM_gpt.py b/train_ND_MLM_gpt.py deleted file mode 100644 index 3f23320e8..000000000 --- a/train_ND_MLM_gpt.py +++ /dev/null @@ -1,267 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
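# ---------------------------------------------------------------------------
# Editor's sketch (illustrative, not part of the patch series): the span
# corruption implemented by the module deleted above, end to end on a toy
# batch. vocab_len and eos_id are made-up values; sentinel_ids()/filter_ids()
# mirror the deleted create_sentinel_ids()/filter_input_ids().

import numpy as np

vocab_len, eos_id = 32100, 1
tokens = np.array([[11, 12, 13, 14, 15, 16, 17, 18, 19, 20]])
# Hand-made stand-in for random_spans_noise_mask() output (True = noise).
mask = np.array([[0, 0, 1, 1, 0, 0, 0, 1, 1, 1]], dtype=bool)

def sentinel_ids(m):
    # First token of each masked span -> a descending sentinel id taken from
    # the top of the vocabulary; span continuations -> -1 (to be deleted).
    m = m.astype(np.int8)
    start = m - np.roll(m, 1, axis=-1) * m
    start[:, 0] = m[:, 0]
    ids = np.where(start != 0, np.cumsum(start, axis=-1), start)
    ids = np.where(ids != 0, vocab_len - ids, 0)
    return ids - (m - start)

def filter_ids(t, s):
    # Overlay sentinels, then drop the -1 continuations and append EOS.
    full = np.where(s != 0, s, t)
    kept = full[full >= 0].reshape((t.shape[0], -1))
    return np.concatenate([kept, np.full((t.shape[0], 1), eos_id)], axis=-1)

print(filter_ids(tokens, sentinel_ids(mask)))    # [[11 12 32099 15 16 17 32098 1]]
print(filter_ids(tokens, sentinel_ids(~mask)))   # [[32099 13 14 32098 18 19 20 1]]
# Concatenating the two arrays gives the 'text' field of a training sample,
# and the length of the first array is its 'prefix_len'.
# ---------------------------------------------------------------------------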
- -"""Non-Causal Decoder GPT MLM Adaptation""" - -import torch -from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron import mpu - -from megatron.data.non_causal_mlm_dataset import build_train_valid_test_datasets #, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import subprocess - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - prefix_lm=True - ) - # loaded_dir, state_dict = model[0].load_checkpoint( - # args.finetune, load_optimizer_states=False) - # if loaded_dir is None: - # print_rank_0('WARNING: could not find the metadata file {} '.format( - # load_dir)) - # print_rank_0(' will not load any checkpoints and will start from ' - # 'random') - - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=True - ) - see_memory_usage(f"After Building Model", force=True) - return model - -_KEYS = ['text', 'prefix_len'] - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. 
- tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and position ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. - timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.input_length, - masked_lm_prob=args.mask_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - ) - - # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - # elif args.train_weighted_split_paths: - # assigned_train_valid_test = [] - # if args.train_weighted_split_paths is not None: - # train_ds = [] - # assigned_train_valid_test.append("train") - # if args.valid_weighted_split_paths is not None: - # valid_ds = [] - # assigned_train_valid_test.append("valid") - # if args.test_weighted_split_paths is not None: - # test_ds = [] - # assigned_train_valid_test.append("test") - - # for s in assigned_train_valid_test: - # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - # eval(f"args.{s}_weighted_split_weights"), - # eval(f"args.{s}_weighted_split_splits"), - # eval(f"args.{s}_weighted_split_names")) - # for paths, weights, splits, name in data_groups: - # d = build_dataset_group(name, paths, weights, splits, - # args.data_impl, - # train_val_test_num_samples, - # args.seq_length, args.seed, - # (not args.mmap_warmup), - # train_valid_test=s) - # eval(f"{s}_ds").append(d) - # else: - # raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating GPT datasets ...") - return train_ds, valid_ds, test_ds - -def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 - -def git_ds_info(): - from deepspeed.env_report import main as ds_report - ds_report() - - # Write out 
version/git info
- git_hash_cmd = "git rev-parse --short HEAD"
- git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
- if command_exists('git'):
- try:
- result = subprocess.check_output(git_hash_cmd, shell=True)
- git_hash = result.decode('utf-8').strip()
- result = subprocess.check_output(git_branch_cmd, shell=True)
- git_branch = result.decode('utf-8').strip()
- except subprocess.CalledProcessError:
- git_hash = "unknown"
- git_branch = "unknown"
- else:
- git_hash = "unknown"
- git_branch = "unknown"
- print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
-
-
-if __name__ == "__main__":
- git_ds_info()
- pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
- args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})

From e3ce0a76331eceab48233b4a7ba99b5bb1a32ed1 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Wed, 22 Jun 2022 19:20:25 +0700
Subject: [PATCH 148/297] removed any scripts not related to dataset, revert arguments

---
4B8-en-ND-MTF.sh | 155 -----------------------
megatron/arguments.py | 2 -
prepare_tokenizer.py | 25 ----
train_ND_MTF_gpt.py | 277 ------------------------------------------
4 files changed, 459 deletions(-)
delete mode 100644 4B8-en-ND-MTF.sh
delete mode 100644 prepare_tokenizer.py
delete mode 100644 train_ND_MTF_gpt.py

diff --git a/4B8-en-ND-MTF.sh b/4B8-en-ND-MTF.sh
deleted file mode 100644
index 209732ad3..000000000
--- a/4B8-en-ND-MTF.sh
+++ /dev/null
@@ -1,155 +0,0 @@
-#!/bin/bash
-
-EXPERIMENT_NAME=4B8-en-ND-MTF
-REPO_PATH=experiments/$EXPERIMENT_NAME
-CHECKPOINT_PATH=$REPO_PATH/checkpoints
-TENSORBOARD_PATH=$REPO_PATH/tensorboard
-CODECARBON_PATH=$REPO_PATH/codecarbon
-LOGS_PATH=$REPO_PATH/logs
-
-DATA_PATH=data/mc4-id_text_document
-
-# XXX: edit me
-GPUS_PER_NODE=8
-NNODES=1
-PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
-TP_SIZE=1 # always fixed to the size of a single node
-DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer
-
-MICRO_BATCH_SIZE=1
-GLOBAL_BATCH_SIZE=1024
-TRAIN_ITER=10_000
-INPUT_LEN=1024
-TARGET_LEN=256
-
-NLAYERS=24
-NHIDDEN=4096
-NHEADS=64
-FFN_HIDDEN_SIZE=10240
-MAX_POSITION_EMBEDDING=1280
-
-SAVE_INTERVAL=1500
-
-OPTIMIZER_ARGS=" \
- --optimizer adam \
- --adam-beta1 0.9 \
- --adam-beta2 0.999 \
- --adam-eps 1e-8 \
- --lr 2e-4 \
- --min-lr 1e-5 \
- --lr-decay-style cosine \
- --clip-grad 1.0 \
- --weight-decay 1e-1 \
- "
-
-EXIT_OPTS=" \
- --exit-duration-in-mins 1190 \
- "
-
-GPT_ARGS=" \
- --num-layers $NLAYERS \
- --hidden-size $NHIDDEN \
- --num-attention-heads $NHEADS \
- --ffn-hidden-size $FFN_HIDDEN_SIZE \
- --max-position-embeddings $SEQ_LEN \
- --position-embedding-type alibi \
- --encoder-seq-length $INPUT_LEN \
- --decoder-seq-length $TARGET_LEN \
- --micro-batch-size $MICRO_BATCH_SIZE \
- --global-batch-size $GLOBAL_BATCH_SIZE \
- --train-iters $TRAIN_ITER \
- --tokenizer-type PretrainedFromHF \
- --tokenizer-name-or-path bigscience/tokenizer \
- --loss-scale 12 \
- --clip-grad 1.0 \
- --fp16 \
- --checkpoint-activations \
- $OPTIMIZER_ARGS \
- $EXIT_OPTS \
- "
-
-OUTPUT_ARGS=" \
- --log-interval 200 \
- --save-interval $SAVE_INTERVAL \
- --eval-interval $TRAIN_ITER \
- --eval-iters 1 \
- --tensorboard-dir $TENSORBOARD_PATH \
- --tensorboard-queue-size 5 \
- --log-timers-to-tensorboard \
- --log-batch-size-to-tensorboard \
- --log-validation-ppl-to-tensorboard \
- "
-
-ZERO_STAGE=1
-
-config_json="./ds_config.json"
-
-# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size()
-cat <
$config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " - -# export LAUNCHER="python -u -m torch.distributed.launch \ -# --nproc_per_node $GPUS_PER_NODE \ -# " -# # --nnodes $NNODES \ -# # --master_addr $MASTER_ADDR \ -# # --master_port $MASTER_PORT \ - -export CMD=" \ - `pwd`/train_ND_MTF_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - $GPT_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - $DEEPSPEED_ARGS \ - " - - -# # clear old checkpoint as it'd mismatch while we sort things out -# rm -rf $SAVE_CHECKPOINT_PATH - - -echo $CMD - -# We create the folder where the logs and codecarbon will be stored. -mkdir -p $REPO_PATH -mkdir -p $LOGS_PATH -# to debug - add echo (it exits and prints what it would have launched) - -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ - $CMD - -# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/megatron/arguments.py b/megatron/arguments.py index def726d98..230bd4d65 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -880,8 +880,6 @@ def __call__(self, parser, args, values, option_string=None): 'They are used for span masking in the T5 model') group.add_argument('--seq-length', type=int, default=None, help='Maximum sequence length to process.') - group.add_argument('--input-length', type=int, default=None, - help='Maximum input length to process for MLM adaptation.') group.add_argument('--encoder-seq-length', type=int, default=None, help='Maximum encoder sequence length to process.' 'This should be exclusive of --seq-length') diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py deleted file mode 100644 index 280ba458d..000000000 --- a/prepare_tokenizer.py +++ /dev/null @@ -1,25 +0,0 @@ -from transformers import AutoTokenizer, AddedToken - -tokenizer = AutoTokenizer.from_pretrained('bigscience/tokenizer') - -tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - AddedToken( - ''.format(str(idx).zfill(3)), - lstrip=False, - rstrip=False, - normalization=False - ) for idx in reversed(range(0,200)) - ] - }) - -tokenizer.save_pretrained('bigscience-tokenizer-padded') - -# python tools/preprocess_data.py \ -# --input data/oscar-en-10k.jsonl \ -# --output-prefix data/meg-gpt2-oscar-en-10k \ -# --dataset-impl mmap \ -# --tokenizer-type PretrainedFromHF \ -# --tokenizer-name-or-path bigscience-tokenizer-padded \ -# --append-eod \ -# --workers 4 \ No newline at end of file diff --git a/train_ND_MTF_gpt.py b/train_ND_MTF_gpt.py deleted file mode 100644 index 69b8c825b..000000000 --- a/train_ND_MTF_gpt.py +++ /dev/null @@ -1,277 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Non-Causal Decoder GPT Multitask Finetuning""" - -import torch -from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron import mpu - -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets #, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import subprocess - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - prefix_lm=True - ) - # loaded_dir, state_dict = model[0].load_checkpoint( - # args.finetune, load_optimizer_states=False) - # if loaded_dir is None: - # print_rank_0('WARNING: could not find the metadata file {} '.format( - # load_dir)) - # print_rank_0(' will not load any checkpoints and will start from ' - # 'random') - - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=True - ) - see_memory_usage(f"After Building Model", force=True) - return model - -_KEYS = ['text', 'prefix_len'] - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and postition ids. 
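# (Editor's note, illustrative: prefix_indices marks where each prompt ends.
# For a length-6 sequence with prefix_len=3, the call below is expected to
# open the attention mask bidirectionally over positions 0-2, keep it causal
# from position 3 onwards, and -- with loss_on_targets_only -- zero the
# loss_mask over the prompt so that only target tokens are scored.)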
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and position ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. 
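
As a quick sanity check on loss_func above: the loss is an average over unmasked token positions only, so positions with loss_mask == 0 (the prompt tokens, under loss_on_targets_only) contribute nothing. A minimal worked example with made-up per-token losses:

import torch

# Made-up per-token losses for a batch of 2 sequences of 3 tokens.
losses = torch.tensor([[1.0, 2.0, 3.0],
                       [4.0, 5.0, 6.0]])
# Zero out the leading positions, as a prefix would be.
loss_mask = torch.tensor([[0.0, 1.0, 1.0],
                          [0.0, 0.0, 1.0]])
loss = torch.sum(losses.view(-1) * loss_mask.view(-1)) / loss_mask.sum()
print(loss)  # (2 + 3 + 6) / 3 = tensor(3.6667)
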
- timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - # train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - # data_prefix=args.data_path, - # data_impl=args.data_impl, - # splits_string=args.split, - # train_valid_test_num_samples=train_val_test_num_samples, - # seq_length=args.seq_length, - # seed=args.seed, - # skip_warmup=(not args.mmap_warmup)) - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, - masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - dataset_type='t5') - - # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - # elif args.train_weighted_split_paths: - # assigned_train_valid_test = [] - # if args.train_weighted_split_paths is not None: - # train_ds = [] - # assigned_train_valid_test.append("train") - # if args.valid_weighted_split_paths is not None: - # valid_ds = [] - # assigned_train_valid_test.append("valid") - # if args.test_weighted_split_paths is not None: - # test_ds = [] - # assigned_train_valid_test.append("test") - - # for s in assigned_train_valid_test: - # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - # eval(f"args.{s}_weighted_split_weights"), - # eval(f"args.{s}_weighted_split_splits"), - # eval(f"args.{s}_weighted_split_names")) - # for paths, weights, splits, name in data_groups: - # d = build_dataset_group(name, paths, weights, splits, - # args.data_impl, - # train_val_test_num_samples, - # args.seq_length, args.seed, - # (not args.mmap_warmup), - # train_valid_test=s) - # eval(f"{s}_ds").append(d) - # else: - # raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating GPT datasets ...") - return train_ds, valid_ds, test_ds - -def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 - -def git_ds_info(): - from deepspeed.env_report import main as ds_report - ds_report() - - # Write out version/git info - git_hash_cmd = "git rev-parse --short HEAD" - git_branch_cmd = "git rev-parse --abbrev-ref HEAD" - if command_exists('git'): - try: - result = subprocess.check_output(git_hash_cmd, shell=True) - git_hash = result.decode('utf-8').strip() - result = subprocess.check_output(git_branch_cmd, shell=True) - git_branch = result.decode('utf-8').strip() - except subprocess.CalledProcessError: - git_hash = "unknown" - git_branch = "unknown" - else: - git_hash = "unknown" - git_branch = "unknown" - print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') - - -if __name__ == "__main__": - git_ds_info() - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - 
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 87e4055cf0549456233d2b92cbe32b40700cda98 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 23 Jun 2022 17:05:59 +0700 Subject: [PATCH 149/297] added sampler and test --- megatron/data/data_samplers.py | 64 ++++++++++++++++++++++++++++++++ tests/test_packing_dataloader.py | 37 ++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 tests/test_packing_dataloader.py diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index c8109b3d2..ee436debb 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -44,6 +44,14 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) + elif args.dataloader_type == 'packed': + batch_sampler = MegatronPackedRandomSampler( + sequence_length=args.seq_length, + total_samples=len(dataset), + consumed_samples=consumed_samples, + micro_batch_size=args.micro_batch_size, + data_parallel_rank=mpu.get_data_parallel_rank(), + data_parallel_size=mpu.get_data_parallel_world_size()) else: raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) @@ -158,3 +166,59 @@ def __iter__(self): self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch batch = [] + + +class MegatronPackedRandomSampler(object): + """docstring for MegatronPackedRandomSampler""" + def __init__(self, sequence_length, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size): + # Keep a copy of input params for later use. + self.sequence_length = sequence_length + self.total_samples = total_samples + self.consumed_samples = consumed_samples + self.micro_batch_size = micro_batch_size + self.data_parallel_rank = data_parallel_rank + self.data_parallel_size = data_parallel_size + self.micro_batch_times_data_parallel_size = \ + self.micro_batch_size * data_parallel_size + self.last_batch_size = \ + self.total_samples % self.micro_batch_times_data_parallel_size + + # Sanity checks. + assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + + def __iter__(self): + active_total_samples = self.total_samples - self.last_batch_size + self.epoch = self.consumed_samples // active_total_samples + current_epoch_samples = self.consumed_samples % active_total_samples + assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + + # data sharding and random sampling + bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ + * self.micro_batch_size + bucket_offset = current_epoch_samples // self.data_parallel_size + start_idx = self.data_parallel_rank * bucket_size + + g = torch.Generator() + g.manual_seed(self.epoch) + + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + + batch = [] + # Last batch if not complete will be dropped. 
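
The __iter__ body that follows shards a single epoch-seeded shuffle across data-parallel ranks: every rank draws randperm with the same seed, then reads from a disjoint contiguous bucket of indices. A self-contained sketch of the same arithmetic (the helper name and the tiny sizes are illustrative, not part of the patch):

import torch

def packed_sampler_indices(total_samples, consumed_samples, micro_batch_size,
                           data_parallel_rank, data_parallel_size):
    """Sketch of MegatronPackedRandomSampler.__iter__ for a single rank
    (hypothetical helper); like the original, an incomplete final batch
    is dropped."""
    mbs_times_dp = micro_batch_size * data_parallel_size
    last_batch_size = total_samples % mbs_times_dp
    active_total_samples = total_samples - last_batch_size
    epoch = consumed_samples // active_total_samples
    current_epoch_samples = consumed_samples % active_total_samples

    bucket_size = (total_samples // mbs_times_dp) * micro_batch_size
    bucket_offset = current_epoch_samples // data_parallel_size
    start_idx = data_parallel_rank * bucket_size

    g = torch.Generator()
    g.manual_seed(epoch)  # same seed on every rank, so the shuffles agree
    random_idx = torch.randperm(bucket_size, generator=g).tolist()
    idx_range = [start_idx + x for x in random_idx[bucket_offset:]]

    batches, batch = [], []
    for idx in idx_range:
        batch.append(idx)
        if len(batch) == micro_batch_size:
            batches.append(batch)
            batch = []
    return batches

# Two ranks draw disjoint buckets from the same epoch-seeded shuffle.
for rank in range(2):
    print(rank, packed_sampler_indices(total_samples=10, consumed_samples=0,
                                       micro_batch_size=2,
                                       data_parallel_rank=rank,
                                       data_parallel_size=2))

Seeding the generator with the epoch rather than the rank is what makes every rank compute the same permutation, while start_idx keeps the ranks' buckets disjoint; the real sampler additionally advances consumed_samples as it yields each batch.
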
+ for idx in idx_range: + batch.append(idx) + if len(batch) == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch = [] diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py new file mode 100644 index 000000000..d5af66d62 --- /dev/null +++ b/tests/test_packing_dataloader.py @@ -0,0 +1,37 @@ +import os +import torch.distributed as dist + +from megatron.initialize import initialize_megatron +# from megatron.data.data_samplers import MegatronPackedRandomSampler +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group + +#Initialize Megatron with dummy variables +initialize_megatron( + extra_args_provider=None, + args_defaults={ + "micro_batch_size": 4, + "num_layers": 4, + "hidden_size": 64, + "num_attention_heads": 4, + "seq_length": 256, + "max_position_embeddings": 256, + } + ) + +train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=["tests/data/gpt2/meg-gpt2-openwebtext_text_document"], + data_impl="mmap", + splits_string="90,5,5", + train_valid_test_num_samples=[100,100,100], + seq_length=1024, + seed=124, + skip_warmup=True + ) + +dl = torch.utils.data.DataLoader( + train_ds, + batch_size=4, + # batch_sampler=batch_sampler, + num_workers=4, + pin_memory=True + ) From 0ae7661de6ffa02a71d1893a5cc0b7056be21d74 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 23 Jun 2022 21:14:47 +0700 Subject: [PATCH 150/297] added testing data --- megatron/data/non_causal_mtf_dataset.py | 16 +- .../t0/ag_news_classify_question_first.json | 100 + .../data/t0/ag_news_prompt_text_document.bin | Bin 0 -> 18494 bytes .../data/t0/ag_news_prompt_text_document.idx | Bin 0 -> 2042 bytes tests/data/t0/gpt2-tiny-merges.txt | 4744 +++++++++++++++++ tests/data/t0/gpt2-tiny-vocab.json | 1 + tests/test_packing_dataloader.py | 3 +- 7 files changed, 4851 insertions(+), 13 deletions(-) create mode 100644 tests/data/t0/ag_news_classify_question_first.json create mode 100644 tests/data/t0/ag_news_prompt_text_document.bin create mode 100644 tests/data/t0/ag_news_prompt_text_document.idx create mode 100644 tests/data/t0/gpt2-tiny-merges.txt create mode 100644 tests/data/t0/gpt2-tiny-vocab.json diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 6bce2c4ef..cae8d4a54 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -271,20 +271,12 @@ def __getitem__(self, idx): ) eod_idx = np.where(sample == self.tokenizer.eod)[0] - if len(eod_idx) > 0: - prefix_len = eod_idx[0] - else: - prefix_len = 0 - - sample = pad_and_convert_to_numpy( - sample, - self.tokenizer.pad, - self.seq_length - ) + input_tokens = sample[:eod_idx] + target_tokens = sample[eod_idx:] return { - 'text': np.array(sample, dtype=np.int64), - 'prefix_len': prefix_len + 'input_tokens': np.array(input_tokens, dtype=np.int64), + 'target_tokens': np.array(target_tokens, dtype=np.int64), } diff --git a/tests/data/t0/ag_news_classify_question_first.json b/tests/data/t0/ag_news_classify_question_first.json new file mode 100644 index 000000000..3f82f55d2 --- /dev/null +++ b/tests/data/t0/ag_news_classify_question_first.json @@ -0,0 +1,100 @@ +{"text": "What label best describes this news article?\nWall St. 
Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", "targets": "Business"} +{"text": "What label best describes this news article?\nCarlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.<|endoftext|>Business", "inputs": "What label best describes this news article?\nCarlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.", "targets": "Business"} +{"text": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", "targets": "Business"} +{"text": "What label best describes this news article?\nIraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.", "targets": "Business"} +{"text": "What label best describes this news article?\nOil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.", "targets": "Business"} +{"text": "What label best describes this news article?\nStocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\\but stayed near lows for the year as oil prices surged past #36;46\\a barrel, offsetting a positive outlook from computer maker\\Dell Inc. 
(DELL.O)<|endoftext|>Business", "inputs": "What label best describes this news article?\nStocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\\but stayed near lows for the year as oil prices surged past #36;46\\a barrel, offsetting a positive outlook from computer maker\\Dell Inc. (DELL.O)", "targets": "Business"} +{"text": "What label best describes this news article?\nMoney Funds Fell in Latest Week (AP) AP - Assets of the nation's retail money market mutual funds fell by #36;1.17 billion in the latest week to #36;849.98 trillion, the Investment Company Institute said Thursday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMoney Funds Fell in Latest Week (AP) AP - Assets of the nation's retail money market mutual funds fell by #36;1.17 billion in the latest week to #36;849.98 trillion, the Investment Company Institute said Thursday.", "targets": "Business"} +{"text": "What label best describes this news article?\nFed minutes show dissent over inflation (USATODAY.com) USATODAY.com - Retail sales bounced back a bit in July, and new claims for jobless benefits fell last week, the government said Thursday, indicating the economy is improving from a midsummer slump.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFed minutes show dissent over inflation (USATODAY.com) USATODAY.com - Retail sales bounced back a bit in July, and new claims for jobless benefits fell last week, the government said Thursday, indicating the economy is improving from a midsummer slump.", "targets": "Business"} +{"text": "What label best describes this news article?\nSafety Net (Forbes.com) Forbes.com - After earning a PH.D. in Sociology, Danny Bazil Riley started to work as the general manager at a commercial real estate firm at an annual base salary of #36;70,000. Soon after, a financial planner stopped by his desk to drop off brochures about insurance benefits available through his employer. But, at 32, \"buying insurance was the furthest thing from my mind,\" says Riley.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSafety Net (Forbes.com) Forbes.com - After earning a PH.D. in Sociology, Danny Bazil Riley started to work as the general manager at a commercial real estate firm at an annual base salary of #36;70,000. Soon after, a financial planner stopped by his desk to drop off brochures about insurance benefits available through his employer. But, at 32, \"buying insurance was the furthest thing from my mind,\" says Riley.", "targets": "Business"} +{"text": "What label best describes this news article?\nWall St. Bears Claw Back Into the Black NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWall St. 
Bears Claw Back Into the Black NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.", "targets": "Business"} +{"text": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook NEW YORK (Reuters) - Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook NEW YORK (Reuters) - Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums.", "targets": "Business"} +{"text": "What label best describes this news article?\nNo Need for OPEC to Pump More-Iran Gov TEHRAN (Reuters) - OPEC can do nothing to douse scorching oil prices when markets are already oversupplied by 2.8 million barrels per day (bpd) of crude, Iran's OPEC governor said Saturday, warning that prices could fall sharply.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNo Need for OPEC to Pump More-Iran Gov TEHRAN (Reuters) - OPEC can do nothing to douse scorching oil prices when markets are already oversupplied by 2.8 million barrels per day (bpd) of crude, Iran's OPEC governor said Saturday, warning that prices could fall sharply.", "targets": "Business"} +{"text": "What label best describes this news article?\nNon-OPEC Nations Should Up Output-Purnomo JAKARTA (Reuters) - Non-OPEC oil exporters should consider increasing output to cool record crude prices, OPEC President Purnomo Yusgiantoro said on Sunday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNon-OPEC Nations Should Up Output-Purnomo JAKARTA (Reuters) - Non-OPEC oil exporters should consider increasing output to cool record crude prices, OPEC President Purnomo Yusgiantoro said on Sunday.", "targets": "Business"} +{"text": "What label best describes this news article?\nGoogle IPO Auction Off to Rocky Start WASHINGTON/NEW YORK (Reuters) - The auction for Google Inc.'s highly anticipated initial public offering got off to a rocky start on Friday after the Web search company sidestepped a bullet from U.S. securities regulators.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO Auction Off to Rocky Start WASHINGTON/NEW YORK (Reuters) - The auction for Google Inc.'s highly anticipated initial public offering got off to a rocky start on Friday after the Web search company sidestepped a bullet from U.S. securities regulators.", "targets": "Business"} +{"text": "What label best describes this news article?\nDollar Falls Broadly on Record Trade Gap NEW YORK (Reuters) - The dollar tumbled broadly on Friday after data showing a record U.S. trade deficit in June cast fresh doubts on the economy's recovery and its ability to draw foreign capital to fund the growing gap.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDollar Falls Broadly on Record Trade Gap NEW YORK (Reuters) - The dollar tumbled broadly on Friday after data showing a record U.S. 
trade deficit in June cast fresh doubts on the economy's recovery and its ability to draw foreign capital to fund the growing gap.", "targets": "Business"} +{"text": "What label best describes this news article?\nRescuing an Old Saver If you think you may need to help your elderly relatives with their finances, don't be shy about having the money talk -- soon.<|endoftext|>Business", "inputs": "What label best describes this news article?\nRescuing an Old Saver If you think you may need to help your elderly relatives with their finances, don't be shy about having the money talk -- soon.", "targets": "Business"} +{"text": "What label best describes this news article?\nKids Rule for Back-to-School The purchasing power of kids is a big part of why the back-to-school season has become such a huge marketing phenomenon.<|endoftext|>Business", "inputs": "What label best describes this news article?\nKids Rule for Back-to-School The purchasing power of kids is a big part of why the back-to-school season has become such a huge marketing phenomenon.", "targets": "Business"} +{"text": "What label best describes this news article?\nIn a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIn a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.", "targets": "Business"} +{"text": "What label best describes this news article?\nUS trade deficit swells in June The US trade deficit has exploded 19 to a record \\$55.8bn as oil costs drove imports higher, according to a latest figures.<|endoftext|>Business", "inputs": "What label best describes this news article?\nUS trade deficit swells in June The US trade deficit has exploded 19 to a record \\$55.8bn as oil costs drove imports higher, according to a latest figures.", "targets": "Business"} +{"text": "What label best describes this news article?\nShell 'could be target for Total' Oil giant Shell could be bracing itself for a takeover attempt, possibly from French rival Total, a press report claims.<|endoftext|>Business", "inputs": "What label best describes this news article?\nShell 'could be target for Total' Oil giant Shell could be bracing itself for a takeover attempt, possibly from French rival Total, a press report claims.", "targets": "Business"} +{"text": "What label best describes this news article?\nGoogle IPO faces Playboy slip-up The bidding gets underway for Google's public offering, despite last-minute worries over an interview with its bosses in Playboy magazine.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO faces Playboy slip-up The bidding gets underway for Google's public offering, despite last-minute worries over an interview with its bosses in Playboy magazine.", "targets": "Business"} +{"text": "What label best describes this news article?\nEurozone economy keeps growing Official figures show the 12-nation eurozone economy continues to grow, but there are warnings it may slow down later in the year.<|endoftext|>Business", "inputs": "What label best describes this news article?\nEurozone economy keeps growing 
Official figures show the 12-nation eurozone economy continues to grow, but there are warnings it may slow down later in the year.", "targets": "Business"} +{"text": "What label best describes this news article?\nExpansion slows in Japan Economic growth in Japan slows down as the country experiences a drop in domestic and corporate spending.<|endoftext|>Business", "inputs": "What label best describes this news article?\nExpansion slows in Japan Economic growth in Japan slows down as the country experiences a drop in domestic and corporate spending.", "targets": "Business"} +{"text": "What label best describes this news article?\nRand falls on shock SA rate cut Interest rates are trimmed to 7.5 by the South African central bank, but the lack of warning hits the rand and surprises markets.<|endoftext|>Business", "inputs": "What label best describes this news article?\nRand falls on shock SA rate cut Interest rates are trimmed to 7.5 by the South African central bank, but the lack of warning hits the rand and surprises markets.", "targets": "Business"} +{"text": "What label best describes this news article?\nCar prices down across the board The cost of buying both new and second hand cars fell sharply over the past five years, a new survey has found.<|endoftext|>Business", "inputs": "What label best describes this news article?\nCar prices down across the board The cost of buying both new and second hand cars fell sharply over the past five years, a new survey has found.", "targets": "Business"} +{"text": "What label best describes this news article?\nSouth Korea lowers interest rates South Korea's central bank cuts interest rates by a quarter percentage point to 3.5 in a bid to drive growth in the economy.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSouth Korea lowers interest rates South Korea's central bank cuts interest rates by a quarter percentage point to 3.5 in a bid to drive growth in the economy.", "targets": "Business"} +{"text": "What label best describes this news article?\nGoogle auction begins on Friday An auction of shares in Google, the web search engine which could be floated for as much as \\$36bn, takes place on Friday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle auction begins on Friday An auction of shares in Google, the web search engine which could be floated for as much as \\$36bn, takes place on Friday.", "targets": "Business"} +{"text": "What label best describes this news article?\nHP shares tumble on profit news Hewlett-Packard shares fall after disappointing third-quarter profits, while the firm warns the final quarter will also fall short of expectations.<|endoftext|>Business", "inputs": "What label best describes this news article?\nHP shares tumble on profit news Hewlett-Packard shares fall after disappointing third-quarter profits, while the firm warns the final quarter will also fall short of expectations.", "targets": "Business"} +{"text": "What label best describes this news article?\nMauritian textile firm cuts jobs One of the oldest textile operators on the Indian Ocean island of Mauritius last week shut seven factories and cut 900 jobs.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMauritian textile firm cuts jobs One of the oldest textile operators on the Indian Ocean island of Mauritius last week shut seven factories and cut 900 jobs.", "targets": "Business"} +{"text": "What label best describes this news article?\nChad seeks refugee aid 
from IMF Chad asks the IMF for a loan to pay for looking after more than 100,000 refugees from conflict-torn Darfur in western Sudan.<|endoftext|>Business", "inputs": "What label best describes this news article?\nChad seeks refugee aid from IMF Chad asks the IMF for a loan to pay for looking after more than 100,000 refugees from conflict-torn Darfur in western Sudan.", "targets": "Business"} +{"text": "What label best describes this news article?\nJapan nuclear firm shuts plants The company running the Japanese nuclear plant hit by a fatal accident is to close its reactors for safety checks.<|endoftext|>Business", "inputs": "What label best describes this news article?\nJapan nuclear firm shuts plants The company running the Japanese nuclear plant hit by a fatal accident is to close its reactors for safety checks.", "targets": "Business"} +{"text": "What label best describes this news article?\nVeteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.<|endoftext|>Business", "inputs": "What label best describes this news article?\nVeteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.", "targets": "Business"} +{"text": "What label best describes this news article?\nSaudi Arabia to open up oil taps Saudi Arabia says it is ready to push an extra 1.3 million barrels a day of oil into the market, to help reverse surging prices.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSaudi Arabia to open up oil taps Saudi Arabia says it is ready to push an extra 1.3 million barrels a day of oil into the market, to help reverse surging prices.", "targets": "Business"} +{"text": "What label best describes this news article?\nSaudi phone sector gets \\$1bn lift A group led by the UAE's Etisalat plans to spend \\$1bn (544m) on expansion after winning two mobile phone licences in Saudi Arabia.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSaudi phone sector gets \\$1bn lift A group led by the UAE's Etisalat plans to spend \\$1bn (544m) on expansion after winning two mobile phone licences in Saudi Arabia.", "targets": "Business"} +{"text": "What label best describes this news article?\nIndians fill rail skills shortage Network Rail flies in specialist Indian engineers to work on the West Coast Mainline because of a UK skills shortage.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIndians fill rail skills shortage Network Rail flies in specialist Indian engineers to work on the West Coast Mainline because of a UK skills shortage.", "targets": "Business"} +{"text": "What label best describes this news article?\nSteady as they go BEDFORD -- Scientists at NitroMed Inc. hope their experimental drugs will cure heart disease someday. But lately their focus has been on more mundane matters.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSteady as they go BEDFORD -- Scientists at NitroMed Inc. hope their experimental drugs will cure heart disease someday. But lately their focus has been on more mundane matters.", "targets": "Business"} +{"text": "What label best describes this news article?\nGoogle IPO: Type in 'confusing,' 'secrecy' I've submitted my bid to buy shares of Google Inc. 
in the computer search company's giant auction-style initial public offering. That could turn out to be the good news or the bad news.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO: Type in 'confusing,' 'secrecy' I've submitted my bid to buy shares of Google Inc. in the computer search company's giant auction-style initial public offering. That could turn out to be the good news or the bad news.", "targets": "Business"} +{"text": "What label best describes this news article?\nA bargain hunter's paradise Massachusetts bargain hunters showed up in droves and shopped hard on yesterday's sales tax holiday, buying everything from treadmills and snow blowers to candles and chandeliers, and crediting the 5-percent tax break with bringing them into the stores.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA bargain hunter's paradise Massachusetts bargain hunters showed up in droves and shopped hard on yesterday's sales tax holiday, buying everything from treadmills and snow blowers to candles and chandeliers, and crediting the 5-percent tax break with bringing them into the stores.", "targets": "Business"} +{"text": "What label best describes this news article?\nResearchers seek to untangle the e-mail thread E-mail is a victim of its own success. That's the conclusion of IBM Corp. researchers in Cambridge, who have spent nearly a decade conducting field tests at IBM and other companies about how employees work and use electronic mail. It's clear to them that e-mail has become the Internet's killer application.<|endoftext|>Business", "inputs": "What label best describes this news article?\nResearchers seek to untangle the e-mail thread E-mail is a victim of its own success. That's the conclusion of IBM Corp. researchers in Cambridge, who have spent nearly a decade conducting field tests at IBM and other companies about how employees work and use electronic mail. It's clear to them that e-mail has become the Internet's killer application.", "targets": "Business"} +{"text": "What label best describes this news article?\nMicrosoft Corp. 2.0: a kinder corporate culture Even a genius can mess up. Bill Gates was a brilliant technologist when he cofounded Microsoft , but as he guided it to greatness in both size and historical consequence, he blundered. He terrorized underlings with his temper and parceled out praise like Scrooge gave to charity. Only the lash inspired the necessary aggressiveness to beat the competition, he thought.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMicrosoft Corp. 2.0: a kinder corporate culture Even a genius can mess up. Bill Gates was a brilliant technologist when he cofounded Microsoft , but as he guided it to greatness in both size and historical consequence, he blundered. He terrorized underlings with his temper and parceled out praise like Scrooge gave to charity. Only the lash inspired the necessary aggressiveness to beat the competition, he thought.", "targets": "Business"} +{"text": "What label best describes this news article?\nLetters Target the abusers of legal weapons We can all share the outrage, expressed by columnist Steve Bailey (''Summer Sizzler, quot; Aug. 11), at the killings in the city's poor neighborhoods. But there's no need to share his ignorance. He argues for renewal of the so-called assault weapon ban, claiming that otherwise, ''UZIs and AK-47s could again be flooding the streets. 
quot; His ...<|endoftext|>Business", "inputs": "What label best describes this news article?\nLetters Target the abusers of legal weapons We can all share the outrage, expressed by columnist Steve Bailey (''Summer Sizzler, quot; Aug. 11), at the killings in the city's poor neighborhoods. But there's no need to share his ignorance. He argues for renewal of the so-called assault weapon ban, claiming that otherwise, ''UZIs and AK-47s could again be flooding the streets. quot; His ...", "targets": "Business"} +{"text": "What label best describes this news article?\nSomewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner. quot; Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSomewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner. quot; Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn.", "targets": "Business"} +{"text": "What label best describes this news article?\nTechnology company sues five ex-employees A Marlborough-based technology company is suing five former employees, including three senior managers, for allegedly conspiring against their employer while working on opening a competing business.<|endoftext|>Business", "inputs": "What label best describes this news article?\nTechnology company sues five ex-employees A Marlborough-based technology company is suing five former employees, including three senior managers, for allegedly conspiring against their employer while working on opening a competing business.", "targets": "Business"} +{"text": "What label best describes this news article?\nGrant to aid Lynn Central Square Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGrant to aid Lynn Central Square Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.", "targets": "Business"} +{"text": "What label best describes this news article?\nState grant to aid Lynn; Bank gives Salem \\$10k Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.<|endoftext|>Business", "inputs": "What label best describes this news article?\nState grant to aid Lynn; Bank gives Salem \\$10k Central Square in Lynn should be looking a bit brighter. 
New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.", "targets": "Business"} +{"text": "What label best describes this news article?\nA New Legal Chapter for a 90's Flameout A lawsuit against Gary Winnick, the former chief of Global Crossing, refocuses attention on what Mr. Winnick knew about his company's finances as it imploded.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA New Legal Chapter for a 90's Flameout A lawsuit against Gary Winnick, the former chief of Global Crossing, refocuses attention on what Mr. Winnick knew about his company's finances as it imploded.", "targets": "Business"} +{"text": "What label best describes this news article?\nWill Russia, the Oil Superpower, Flex Its Muscles? Russia is again emerging as a superpower - but the reason has less to do with nuclear weapons than with oil.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWill Russia, the Oil Superpower, Flex Its Muscles? Russia is again emerging as a superpower - but the reason has less to do with nuclear weapons than with oil.", "targets": "Business"} +{"text": "What label best describes this news article?\nSwitching Titles, if Not Gears, at Dell Kevin B. Rollins, the new chief executive of Dell, talks about Dell's transitory slip in customer service, and why he sees a broader technology recovery taking place.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSwitching Titles, if Not Gears, at Dell Kevin B. Rollins, the new chief executive of Dell, talks about Dell's transitory slip in customer service, and why he sees a broader technology recovery taking place.", "targets": "Business"} +{"text": "What label best describes this news article?\nFor Sale: The Ultimate Status Symbol With the country in need of cash and rich people dying to show off their wealth, Mr. Stein proposes a unique solution: having the government sell titles of nobility.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFor Sale: The Ultimate Status Symbol With the country in need of cash and rich people dying to show off their wealth, Mr. Stein proposes a unique solution: having the government sell titles of nobility.", "targets": "Business"} +{"text": "What label best describes this news article?\nQuality Gets Swept Away Quality Distribution is hammered after reporting a large loss for the second quarter.<|endoftext|>Business", "inputs": "What label best describes this news article?\nQuality Gets Swept Away Quality Distribution is hammered after reporting a large loss for the second quarter.", "targets": "Business"} +{"text": "What label best describes this news article?\nMaking Your Insurer Pay If Hurricane Charley blows your house down, how can you make your insurance company pay?<|endoftext|>Business", "inputs": "What label best describes this news article?\nMaking Your Insurer Pay If Hurricane Charley blows your house down, how can you make your insurance company pay?", "targets": "Business"} +{"text": "What label best describes this news article?\nDelightful Dell The company's results show that it's not grim all over tech world. Just all of it that isn't Dell.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDelightful Dell The company's results show that it's not grim all over tech world. 
Just all of it that isn't Dell.", "targets": "Business"} +{"text": "What label best describes this news article?\nChrysler's Bling King After a tough year, Detroit's troubled carmaker is back -- thanks to a maverick designer and a car that is dazzling the hip-hop crowd<|endoftext|>Business", "inputs": "What label best describes this news article?\nChrysler's Bling King After a tough year, Detroit's troubled carmaker is back -- thanks to a maverick designer and a car that is dazzling the hip-hop crowd", "targets": "Business"} +{"text": "What label best describes this news article?\nWhat's Cool In the Pool ... ... And Hot On the Deck Americans are spending more on tricking out the places where they swim. Here's a look at the new wave of accessories<|endoftext|>Business", "inputs": "What label best describes this news article?\nWhat's Cool In the Pool ... ... And Hot On the Deck Americans are spending more on tricking out the places where they swim. Here's a look at the new wave of accessories", "targets": "Business"} +{"text": "What label best describes this news article?\nThe Age of Doom In 1993 six geeks had a digital nightmare that changed the culture. It's about to get far creepier<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe Age of Doom In 1993 six geeks had a digital nightmare that changed the culture. It's about to get far creepier", "targets": "Business"} +{"text": "What label best describes this news article?\nHip Hop's Online Shop Celebrity fashion is booming. These webpreneurs are bringing it to main street<|endoftext|>Business", "inputs": "What label best describes this news article?\nHip Hop's Online Shop Celebrity fashion is booming. These webpreneurs are bringing it to main street", "targets": "Business"} +{"text": "What label best describes this news article?\nStoking the Steamroller No other recording artist can channel American middle-class tastes quite like Chip Davis and his best-selling band<|endoftext|>Business", "inputs": "What label best describes this news article?\nStoking the Steamroller No other recording artist can channel American middle-class tastes quite like Chip Davis and his best-selling band", "targets": "Business"} +{"text": "What label best describes this news article?\nComing to The Rescue Got a unique problem? Not to worry: you can find a financial planner for every specialized need<|endoftext|>Business", "inputs": "What label best describes this news article?\nComing to The Rescue Got a unique problem? Not to worry: you can find a financial planner for every specialized need", "targets": "Business"} +{"text": "What label best describes this news article?\nThe New Customers Are In Town Today's customers are increasingly demanding, in Asia as elsewhere in the world. Henry Astorga describes the complex reality faced by today's marketers, which includes much higher expectations than we have been used to. Today's customers want performance, and they want it now!<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe New Customers Are In Town Today's customers are increasingly demanding, in Asia as elsewhere in the world. Henry Astorga describes the complex reality faced by today's marketers, which includes much higher expectations than we have been used to. 
Today's customers want performance, and they want it now!", "targets": "Business"} +{"text": "What label best describes this news article?\nBarrel of Monkeys, 2004 Edition: Notes on Philippine Elections Well, it's election time in the Republic of the Philippines, and that means the monkeys are rolling around in those political barrels, having as much fun as they can while laughing their heads off at the strange goings-on that characterize a democratic process loosely based on the American model but that de facto looks more like a Fellini movie crossed with a Tom and Jerry cartoon - column includes a useful election-year glossary!<|endoftext|>Business", "inputs": "What label best describes this news article?\nBarrel of Monkeys, 2004 Edition: Notes on Philippine Elections Well, it's election time in the Republic of the Philippines, and that means the monkeys are rolling around in those political barrels, having as much fun as they can while laughing their heads off at the strange goings-on that characterize a democratic process loosely based on the American model but that de facto looks more like a Fellini movie crossed with a Tom and Jerry cartoon - column includes a useful election-year glossary!", "targets": "Business"} +{"text": "What label best describes this news article?\nOldsmobile: The final parking lot Why General Motors dropped the Oldsmobile. The four brand paradoxes GM had to face - the name, the product, image re-positioning, and the consumer - all added up to a brand that had little hope of rebranding.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOldsmobile: The final parking lot Why General Motors dropped the Oldsmobile. The four brand paradoxes GM had to face - the name, the product, image re-positioning, and the consumer - all added up to a brand that had little hope of rebranding.", "targets": "Business"} +{"text": "What label best describes this news article?\nNot All Jobs Belong To The White Man: Asian Minorities, Affirmative Action, And The Quest For Parity At Work Although a smattering of Chinese, Filipinos, Japanese, Indians, Thais, and others may crow about seeing their kind sitting in prominent positions in corporations and organizations in the USA, these accomplishments become mere cultural high-fives and ritualistic chest-thumping goaded and impishly patronized by 'mainstream society' - the milder and gentler term for the white-dominated populace.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNot All Jobs Belong To The White Man: Asian Minorities, Affirmative Action, And The Quest For Parity At Work Although a smattering of Chinese, Filipinos, Japanese, Indians, Thais, and others may crow about seeing their kind sitting in prominent positions in corporations and organizations in the USA, these accomplishments become mere cultural high-fives and ritualistic chest-thumping goaded and impishly patronized by 'mainstream society' - the milder and gentler term for the white-dominated populace.", "targets": "Business"} +{"text": "What label best describes this news article?\nDownhome Pinoy Blues, Intersecting Life Paths, and Heartbreak Songs The Blues is alive and well in the Philippines, as evidenced by this appreciation of the Pinoy Blues band 'Lampano Alley', penned by columnist Clarence Henderson as a counterpoint to his usual economics, business, and culture fare.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDownhome Pinoy Blues, Intersecting Life Paths, and 
Heartbreak Songs The Blues is alive and well in the Philippines, as evidenced by this appreciation of the Pinoy Blues band 'Lampano Alley', penned by columnist Clarence Henderson as a counterpoint to his usual economics, business, and culture fare.", "targets": "Business"} +{"text": "What label best describes this news article?\nThe Real Time Modern Manila Blues: Bill Monroe Meets Muddy Waters in the Orient Globalization does strange things to people. A day in the life of a Manila Philippines based business consultant - proving that you really CAN talk about Muddy Walters, bluegrass and work all on the same page...<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe Real Time Modern Manila Blues: Bill Monroe Meets Muddy Waters in the Orient Globalization does strange things to people. A day in the life of a Manila Philippines based business consultant - proving that you really CAN talk about Muddy Walters, bluegrass and work all on the same page...", "targets": "Business"} +{"text": "What label best describes this news article?\nBest Asian Tourism Destinations The new APMF survey of the best Asian tourism destinations has just kicked off, but it's crowded at the top, with Chiang Mai in Thailand just leading from perennial favourites Hong Kong, Bangkok and Phuket in Thailand, and Bali in Indonesia. Be one of the first to vote and let us know your reasons.<|endoftext|>Business", "inputs": "What label best describes this news article?\nBest Asian Tourism Destinations The new APMF survey of the best Asian tourism destinations has just kicked off, but it's crowded at the top, with Chiang Mai in Thailand just leading from perennial favourites Hong Kong, Bangkok and Phuket in Thailand, and Bali in Indonesia. Be one of the first to vote and let us know your reasons.", "targets": "Business"} +{"text": "What label best describes this news article?\nWhat are the best cities for business in Asia? One of our new categories in the APMF Sense of Place survey is for best Asian business city. After a couple of days, Singapore leads the pack, followed by Bangkok, Thailand and Hong Kong. Enter your vote and comments and make your views count. More new categories include best city for livability, and best tourism destinations.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWhat are the best cities for business in Asia? One of our new categories in the APMF Sense of Place survey is for best Asian business city. After a couple of days, Singapore leads the pack, followed by Bangkok, Thailand and Hong Kong. Enter your vote and comments and make your views count. More new categories include best city for livability, and best tourism destinations.", "targets": "Business"} +{"text": "What label best describes this news article?\nIT alligator tales I grew up in New York, where giant alligators -- sometimes more ornately described as albino alligators -- were rumored to roam the citys sewer systems. According to legend, vacationers picked up the tiny crocodilians in Florida, brought them home to New York, and eventually flushed the little buggers when they grew too big for the local concrete jungle.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIT alligator tales I grew up in New York, where giant alligators -- sometimes more ornately described as albino alligators -- were rumored to roam the citys sewer systems. 
According to legend, vacationers picked up the tiny crocodilians in Florida, brought them home to New York, and eventually flushed the little buggers when they grew too big for the local concrete jungle.", "targets": "Business"} +{"text": "What label best describes this news article?\nIT Myth 5: Most IT projects fail Do most IT projects fail? Some point to the number of giant consultancies such as IBM Global Services, Capgemini, and Sapient, who feed off bad experiences encountered by enterprises. Sapient is a company founded on the realization that IT projects are not successful, says Sapient CTO Ben Gaucherin.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIT Myth 5: Most IT projects fail Do most IT projects fail? Some point to the number of giant consultancies such as IBM Global Services, Capgemini, and Sapient, who feed off bad experiences encountered by enterprises. Sapient is a company founded on the realization that IT projects are not successful, says Sapient CTO Ben Gaucherin.", "targets": "Business"} +{"text": "What label best describes this news article?\nBEA grabs CA exec to head product group BEA Systems Inc. has hired the Computer Associates International Inc. executive responsible for CA's Unicenter line of enterprise management software to head BEA's product development group.<|endoftext|>Business", "inputs": "What label best describes this news article?\nBEA grabs CA exec to head product group BEA Systems Inc. has hired the Computer Associates International Inc. executive responsible for CA's Unicenter line of enterprise management software to head BEA's product development group.", "targets": "Business"} +{"text": "What label best describes this news article?\nAutodesk tackles project collaboration Autodesk this week unwrapped an updated version of its hosted project collaboration service targeted at the construction and manufacturing industries. Autodesk Buzzsaw lets multiple, dispersed project participants -- including building owners, developers, architects, construction teams, and facility managers -- share and manage data throughout the life of a project, according to Autodesk officials.<|endoftext|>Business", "inputs": "What label best describes this news article?\nAutodesk tackles project collaboration Autodesk this week unwrapped an updated version of its hosted project collaboration service targeted at the construction and manufacturing industries. Autodesk Buzzsaw lets multiple, dispersed project participants -- including building owners, developers, architects, construction teams, and facility managers -- share and manage data throughout the life of a project, according to Autodesk officials.", "targets": "Business"} +{"text": "What label best describes this news article?\nU.K.'s NHS taps Gartner to help plan \\$9B IT overhaul LONDON -- The U.K.'s National Health Service (NHS) has tapped IT researcher Gartner Inc. to provide market intelligence services as the health organization forges ahead with a mammoth, 5 billion (\\$9.2 billion) project to upgrade its information technology infrastructure.<|endoftext|>Business", "inputs": "What label best describes this news article?\nU.K.'s NHS taps Gartner to help plan \\$9B IT overhaul LONDON -- The U.K.'s National Health Service (NHS) has tapped IT researcher Gartner Inc. 
to provide market intelligence services as the health organization forges ahead with a mammoth, 5 billion (\\$9.2 billion) project to upgrade its information technology infrastructure.", "targets": "Business"} +{"text": "What label best describes this news article?\nPlay Boys: Google IPO a Go Anyway Even though Google's two founders gave an interview to Playboy magazine in the midst of its IPO filing, the SEC allowed the company's offering to go ahead. The boys filed the interview with the SEC and corrected mistakes in it.<|endoftext|>Business", "inputs": "What label best describes this news article?\nPlay Boys: Google IPO a Go Anyway Even though Google's two founders gave an interview to Playboy magazine in the midst of its IPO filing, the SEC allowed the company's offering to go ahead. The boys filed the interview with the SEC and corrected mistakes in it.", "targets": "Business"} +{"text": "What label best describes this news article?\nMore Big Boobs in Playboy An interview with Google's co-founders due out in the current issue of Playboy may delay the company's IPO. Securities regulations restrict what executives can say while preparing to sell stock for the first time.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMore Big Boobs in Playboy An interview with Google's co-founders due out in the current issue of Playboy may delay the company's IPO. Securities regulations restrict what executives can say while preparing to sell stock for the first time.", "targets": "Business"} +{"text": "What label best describes this news article?\nDutch Firm Beats Apple to Punch A music retailer from the Netherlands beats Apple by launching a download service in Europe's latest market battleground. Also: Movie industry wrests agreement from defunct company.... Microsoft challenges Photoshop hellip;. and more.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDutch Firm Beats Apple to Punch A music retailer from the Netherlands beats Apple by launching a download service in Europe's latest market battleground. Also: Movie industry wrests agreement from defunct company.... Microsoft challenges Photoshop hellip;. and more.", "targets": "Business"} +{"text": "What label best describes this news article?\nHP to Buy Synstar Hewlett-Packard will pay \\$297 million for the British company. Also: TiVo goes all out to attract customers hellip;. Sprint offers service guarantees for business wireless subscribers hellip;. and more.<|endoftext|>Business", "inputs": "What label best describes this news article?\nHP to Buy Synstar Hewlett-Packard will pay \\$297 million for the British company. Also: TiVo goes all out to attract customers hellip;. Sprint offers service guarantees for business wireless subscribers hellip;. and more.", "targets": "Business"} +{"text": "What label best describes this news article?\nA Personal Operator From Verizon Verizon plans to offer a service that would act as a virtual switchboard operator, letting customers stay in touch at all times. The program would send phone calls, voicemails and e-mails wherever customers designate. By Elisa Batista.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA Personal Operator From Verizon Verizon plans to offer a service that would act as a virtual switchboard operator, letting customers stay in touch at all times. The program would send phone calls, voicemails and e-mails wherever customers designate. 
By Elisa Batista.", "targets": "Business"} +{"text": "What label best describes this news article?\nPaid Search Growth May Slow A new Internet advertising forecast shows a slowdown in paid search listings in the next five years. Will the projection affect Google's prospects when it goes public?<|endoftext|>Business", "inputs": "What label best describes this news article?\nPaid Search Growth May Slow A new Internet advertising forecast shows a slowdown in paid search listings in the next five years. Will the projection affect Google's prospects when it goes public?", "targets": "Business"} +{"text": "What label best describes this news article?\nFark Sells Out. France Surrenders Blogs are the hottest thing on the Net, but are they messing with traditional publishing principles? One of the most popular, Fark.com, is allegedly selling links. Is it the wave of the future? By Daniel Terdiman.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFark Sells Out. France Surrenders Blogs are the hottest thing on the Net, but are they messing with traditional publishing principles? One of the most popular, Fark.com, is allegedly selling links. Is it the wave of the future? By Daniel Terdiman.", "targets": "Business"} +{"text": "What label best describes this news article?\n'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absenteeism a little high\\on Tuesday among the guys at the office? EA Sports would like\\to think it was because \"Madden NFL 2005\" came out that day,\\and some fans of the football simulation are rabid enough to\\take a sick day to play it.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\n'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absenteeism a little high\\on Tuesday among the guys at the office? EA Sports would like\\to think it was because \"Madden NFL 2005\" came out that day,\\and some fans of the football simulation are rabid enough to\\take a sick day to play it.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nGroup to Propose New High-Speed Wireless Format (Reuters) Reuters - A group of technology companies\\including Texas Instruments Inc. (TXN.N), STMicroelectronics\\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\\will propose a new wireless networking standard up to 10 times\\the speed of the current generation.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nGroup to Propose New High-Speed Wireless Format (Reuters) Reuters - A group of technology companies\\including Texas Instruments Inc. (TXN.N), STMicroelectronics\\(STM.PA) and Broadcom Corp. 
(BRCM.O), on Thursday said they\\will propose a new wireless networking standard up to 10 times\\the speed of the current generation.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nAOL to Sell Cheap PCs to Minorities and Seniors (Reuters) Reuters - America Online on Thursday said it\\plans to sell a low-priced PC targeting low-income and minority\\households who agree to sign up for a year of dialup Internet\\service.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nAOL to Sell Cheap PCs to Minorities and Seniors (Reuters) Reuters - America Online on Thursday said it\\plans to sell a low-priced PC targeting low-income and minority\\households who agree to sign up for a year of dialup Internet\\service.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nCompanies Approve New High-Capacity Disc Format (Reuters) Reuters - A group of consumer electronics\\makers said on Wednesday they approved the format for a new\\generation of discs that can store five times the data of DVDs\\at the same cost -- enough to put a full season of \"The\\Sopranos\" on one disc.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nCompanies Approve New High-Capacity Disc Format (Reuters) Reuters - A group of consumer electronics\\makers said on Wednesday they approved the format for a new\\generation of discs that can store five times the data of DVDs\\at the same cost -- enough to put a full season of \"The\\Sopranos\" on one disc.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nMissing June Deals Slow to Return for Software Cos. (Reuters) Reuters - The mystery of what went wrong for the\\software industry in late June when sales stalled at more than\\20 brand-name companies is not even close to being solved\\although the third quarter is nearly halfway over.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nMissing June Deals Slow to Return for Software Cos. 
(Reuters) Reuters - The mystery of what went wrong for the\\software industry in late June when sales stalled at more than\\20 brand-name companies is not even close to being solved\\although the third quarter is nearly halfway over.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nHacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nHacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nEuropean Download Services Go Mobile (Reuters) Reuters - The ability to download complete\\tracks directly over cell-phone networks to mobile phones is\\becoming a reality in Europe.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nEuropean Download Services Go Mobile (Reuters) Reuters - The ability to download complete\\tracks directly over cell-phone networks to mobile phones is\\becoming a reality in Europe.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nOpen Source Apps Developer SugarCRM Releases Sugar.Sales 1.1 (TechWeb) TechWeb - News - August 13, 2004<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nOpen Source Apps Developer SugarCRM Releases Sugar.Sales 1.1 (TechWeb) TechWeb - News - August 13, 2004", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nOracle Sales Data Seen Being Released (Reuters) Reuters - Oracle Corp. sales documents\\detailing highly confidential information, such as which\\companies receive discounts on Oracle's business software\\products and the size of the discounts, are likely to be made\\public, a federal judge said on Friday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nOracle Sales Data Seen Being Released (Reuters) Reuters - Oracle Corp. sales documents\\detailing highly confidential information, such as which\\companies receive discounts on Oracle's business software\\products and the size of the discounts, are likely to be made\\public, a federal judge said on Friday.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nSun's Looking Glass Provides 3D View (PC World) PC World - Developers get early code for new operating system 'skin' still being crafted.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nSun's Looking Glass Provides 3D View (PC World) PC World - Developers get early code for new operating system 'skin' still being crafted.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nApple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. 
will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nApple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nCharley's Force Took Experts by Surprise (AP) AP - Hurricane Charley's 145-mph force took forecasters by surprise and showed just how shaky a science it still is to predict a storm's intensity #151; even with all the latest satellite and radar technology.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nCharley's Force Took Experts by Surprise (AP) AP - Hurricane Charley's 145-mph force took forecasters by surprise and showed just how shaky a science it still is to predict a storm's intensity #151; even with all the latest satellite and radar technology.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nScience, Politics Collide in Election Year (AP) AP - With more than 4,000 scientists, including 48 Nobel Prize winners, having signed a statement opposing the Bush administration's use of scientific advice, this election year is seeing a new development in the uneasy relationship between science and politics.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nScience, Politics Collide in Election Year (AP) AP - With more than 4,000 scientists, including 48 Nobel Prize winners, having signed a statement opposing the Bush administration's use of scientific advice, this election year is seeing a new development in the uneasy relationship between science and politics.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nBuilding Dedicated to Columbia Astronauts (AP) AP - A former dormitory converted to classrooms at the Pensacola Naval Air Station was dedicated Friday to two Columbia astronauts who were among the seven who died in the shuttle disaster Feb. 1, 2003.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nBuilding Dedicated to Columbia Astronauts (AP) AP - A former dormitory converted to classrooms at the Pensacola Naval Air Station was dedicated Friday to two Columbia astronauts who were among the seven who died in the shuttle disaster Feb. 1, 2003.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nRussian Cargo Craft Docks at Space Station (AP) AP - A Russian cargo ship docked with the international space station Saturday, bringing food, water, fuel and other items to the two-man Russian-American crew, a space official said.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nRussian Cargo Craft Docks at Space Station (AP) AP - A Russian cargo ship docked with the international space station Saturday, bringing food, water, fuel and other items to the two-man Russian-American crew, a space official said.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nBangkok's Canals Losing to Urban Sprawl (AP) AP - Along the banks of the canal, women in rowboats grill fish and sell fresh bananas. 
Families eat on floating pavilions, rocked gently by waves from passing boats.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nBangkok's Canals Losing to Urban Sprawl (AP) AP - Along the banks of the canal, women in rowboats grill fish and sell fresh bananas. Families eat on floating pavilions, rocked gently by waves from passing boats.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nT. Rex Had Teen Growth Spurt, Scientists Say (Reuters) Reuters - Tyrannosaurus Rex grew incredibly fast\\during a teenaged growth spurt that saw the dinosaur expand its\\bulk by six times, but the fearsome beasts \"lived fast and died\\young,\" researchers said on Wednesday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nT. Rex Had Teen Growth Spurt, Scientists Say (Reuters) Reuters - Tyrannosaurus Rex grew incredibly fast\\during a teenaged growth spurt that saw the dinosaur expand its\\bulk by six times, but the fearsome beasts \"lived fast and died\\young,\" researchers said on Wednesday.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nGene Blocker Turns Monkeys Into Workaholics - Study (Reuters) Reuters - Procrastinating monkeys were turned\\into workaholics using a gene treatment to block a key brain\\compound, U.S. researchers reported on Wednesday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nGene Blocker Turns Monkeys Into Workaholics - Study (Reuters) Reuters - Procrastinating monkeys were turned\\into workaholics using a gene treatment to block a key brain\\compound, U.S. researchers reported on Wednesday.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nDolphins Too Have Born Socialites (Reuters) Reuters - Some people are born to be the life and\\soul of the party -- and so it seems are some dolphins.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nDolphins Too Have Born Socialites (Reuters) Reuters - Some people are born to be the life and\\soul of the party -- and so it seems are some dolphins.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nWhat's in a Name? Well, Matt Is Sexier Than Paul (Reuters) Reuters - As Shakespeare said, a rose by any other\\name would smell as sweet. Right?<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nWhat's in a Name? Well, Matt Is Sexier Than Paul (Reuters) Reuters - As Shakespeare said, a rose by any other\\name would smell as sweet. 
Right?", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nUK Scientists Allowed to Clone Human Embryos (Reuters) Reuters - British scientists said on Wednesday\\they had received permission to clone human embryos for medical\\research, in what they believe to be the first such license to\\be granted in Europe.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nUK Scientists Allowed to Clone Human Embryos (Reuters) Reuters - British scientists said on Wednesday\\they had received permission to clone human embryos for medical\\research, in what they believe to be the first such license to\\be granted in Europe.", "targets": "Science and technology"} +{"text": "What label best describes this news article?\nRussian Alien Spaceship Claims Raise Eyebrows, Skepticism (SPACE.com) SPACE.com - An expedition of Russian researchers claims to have found evidence that an \\ alien spaceship had something to do with a huge explosion over Siberia in 1908. \\ Experts in asteroids and comets have long said the massive blast was caused \\ by a space rock.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nRussian Alien Spaceship Claims Raise Eyebrows, Skepticism (SPACE.com) SPACE.com - An expedition of Russian researchers claims to have found evidence that an \\ alien spaceship had something to do with a huge explosion over Siberia in 1908. \\ Experts in asteroids and comets have long said the massive blast was caused \\ by a space rock.", "targets": "Science and technology"} diff --git a/tests/data/t0/ag_news_prompt_text_document.bin b/tests/data/t0/ag_news_prompt_text_document.bin new file mode 100644 index 0000000000000000000000000000000000000000..49b142d487d7ad6663aa0abaace9906c18aa7e79 GIT binary patch literal 18494 zcmdsfd3+RA*7iO3)~%|ps;*uVf(ao@vt!s2AVN0QK-eNpfI!$u*aQ)kO~3#%Ci8)&F|TgKTehus@ILPX=V9O_R+G(o>XIIJJ*K=Bl!B7K$r zCCXv#-B$ECKT8McTAoTeZFhaP&5qDb{0~|t`GYxZt~d@k3TU%7o2TH4X-2FoYR(4Z zjjQP^S<1c<$1Ib7@969Uw_TTV*i92RDOq}%yBR7`#GU6{qX zg2wC#cQPf>aSp0WQ#x9}Xc z+?ua%fJ_r*?Z|zpWa1fJeEadcQhuLzqf6A=`yhQ|r1671^Su)L`V%atjqL;)}X<>y8us1fl&mcVMn5~`_N40n+m1#?j z>xGWz?g+l2|46Y)d_nu(SxsYTfn&4jVj1Kp;4i>8=Ig)Rab;IS0&Z}jr-1Hqa+nzy zV#U*bnSrZ|u;*j=Xzbmdz^mej(Oa)(wPk^o#Gdfxunx@iT<5V_E^lYe6hxK2sFiJ? 
zT(>SY#Ix$L-F%+c%POcItIvM6y{tL*CpOQ{V2kApx<`cB4B8EEyO%t)m<&-ZD(FwP z$<}(~`Q4^ZQ#Sj$6Q(ak8L!Z%8IJ@G^8_Qz?sTfCMruE7I6>cKE1#Y1l`{FCzp2f} zLAndRFX%eWMl@^6EB%j?<#f^BHHW!zJj;z$2Mt&}ReBbQeux82X$oJa^<@32f@a#Y z7>cLu`ar5azBTL(W_*IwcdzANH5D$x3CC_97jJxge-U=FJbYwwTpOSR2 zR`Y#4|JD>59E^rhBkX8B{7Z#bzob6^sm9R}trYpe&<4{@)Pa|>I5U~n!iseI*c=s{ z1?h4FlRU**ynYLP?VCl3bU_~G$E|DKmWY)#x|(3xEFZGpDkbgbHbYEy=F2nU0pI(w zir%Kb^B(Z1t^Qm-T8^S^BE)SiDd(br zQj2=iU0O5xQf{De{4Jvize$Tq8|!ZUr;J$5mQj7=%mUs_ZwfD{$&bI68LJuj|F1IR zf662OUcTSvlWG=1rug^Sq?)DEzt1Mstjzb{XOac>GSWOnw1---N@T8K^ss2-IIJ^x0cr`M9eb$JzVs(EN;o^Dp`a|>%jfM@TQEP1~56~z|fR(A3Hi0*gK5YR>eG2dE znS&geBC2QtET=a6L{>-QTbOm@1GEa*PciF^=#@e$&)V!`U-%b!I%)xmgA83^XRF{B zg~%2Md6Bt=>d;6tR(op3By!9YrxI%(3~Pl>%$)Qw1umeaOTU#!+8B+s*}2+{n97w0?NP(RmbkG?e+2A|JJi zsFAND>mwC@P?*^xzDLG8PR-;RAd?s2qg~m0IwWhb$3{A+C(|dI%H);e2QgBP<~z6} zKBl@_Co5l0g3Z>Vp;XL&qHC$M98R@j_PUvN)M`%2-9Q@3b}E4zqn{n2P1DhFYHy0uf-+^8EksP~jwnGuMrodHF{6e!ZxhK>KptC?+_ z5dC7tT)&P$U+Y2oo7<7S3k*~jR`Z~!q&OO*C(sM_-}rf<;!BvK@x*2)+|8)-a{Q3NUVmnV%k=f}lrsN z9&Am0uRa?k6goRD!`}CSI=A{$X{4tt7@~W9x6r9TtZ!?@7E+O^s^2a2A|VFiPw~oV?1Zq(*uQH4&xSKy4AC>3;2J;ic{3PA8hJRW;mOUv6Xo zIjXAcRdL!mLUA#_HIp%Yd|ZEHB2AHVeD8b4$Au#+Jc^1$eNqJvi1+DU8Nfp_hVgc)B~a!w2t4esZ5iM_@gpv3iR8fq@rHV zBCqJf1TyYungUBuQKG<75yEEsET2@*h*ge8Kr8BPj8KIjhh3B|IHbw61~ub}j04^nmu zxNd*#s2-xD#t?B2O^BIGSYULai@ccsLu@j;z{0Mf4b+}8z#NQ3-WiQdp#So-ijmzt zepth8vWR{*_H)T6aSPcuU#Po-xO1StA5-kqUTqw(W>?l5k;;NGWz(CGKUSSJVK-V^ z0$=;O1;XqgJ&Y_oSKsT=X++$XfE7GtDw|#9Cef4puRI@eZ8ui&<`m#-U^x%F6&rAw8iW0fkCHS=|6D*|cAnoM3qmSl9Vo=r8M-lUbl zHr9KS>4QKbG%%umc z7i{I3{z&f5pn+x{Jt~*lDuTX;7+ar(!4~$zo+W`HxeC&K>%V+`1;w+5Rxi{q1?16G zF4)hn(V6&w*QFo5%vtGu4y?%KHO9JUjoBEo^;$F*(YlNm1~2MoXuLPcxCuXb#naI4 z1!D<3tBErama`W1g-&y=PxRL!jNoADh!j}Y{od%EvA#W%t>oVZM$^Z17)WxH(SSA6 z>#>nCo922iiCMIaTxta`izmfK4Yx1RNUZ~H@CeivACZ^7qSjU*km8G{&d}v_kwLn= zL@Vqg)Dx0Q?9n~qM|rh%HfB`xYebTWD)kZU@;=^2>&muzpZD$KIUbF63Z1nFA`FY; z&Nv_rl>u~H#l{rGY9+DC1-Y)OY{C&V6^-W(YevmHH8qi!HI@hYQ`&d9I}6v}FE;6m z@U@A#Nso0u(^wq!7q?S7-AyAYOwIK!6cl|alTSpImo> z^$*fy@RlKJY<+{;Vm`2&mltTO^bgG8w9QPQVO;svwG;x9jV<0(#JxbTdoMj0=`4fov5HYeo(iT?H|j{Y z>+K=yPqdL*d0(-C^HB>0-(WL% znDS924C95r$fBwmoyG>j^sY3B$BY3o*o&U&{-~Ua%kZ&8M5Ug(5y5kFeI;I^IiAh# zA!=Y=#P2J7D`-2Pij|KPV}aGL)~m!4n!x+fkGw0J$>ZrSf$9FP;yChCKe}6tlO?dR zM6E zZ5!;V7vCUT!CsEicyKy%w6&1M)ibf*8Po<$T`_wc{fN(v8F?Sx*#>c6c$P-q zutK)1=|7}*vM0i9jHi}Q!V6Ig-L8!eJ8fTpB{Y5KMlN6DezGt zz2gTW<2q~2+OfCL*U6)%$p207?1TK<2#$M}C*bb8426@UQVsW%Sh0CYCTk3w_L#gV zS_7?h1opnDP4>THl+h1TMa0j|LcBxuJdPli>A)XP!$x$vLpy``7=3HQz>eV40mIW> zp3rANuRnuD$b#O~o>)`|Sad7Brm4M|;5&ireG{MLE2I*difG#lHOPDLM)llSJv@tb zVO`||a<2X;1!=kQCp_l{S|Ss{CFRk1*^{-hufyJkoD+zq`Sd8bz~#m^qaFG)*UK{P znqXUx>QxredVePUz>8VASJN&<_~fJFufP;5g9IB)puCu)YD4s+*1?Ju>bsXN8NJzF zV<>XeBk1{D{!Vo`)bCn|ETBHc9E=pt0a?8w6sxrc*}WM1*hzaB%tI}D4p@Gs9tKih zZ!DDOuo@LP*V$X_+r$F&Cogza4`&74Lg&S3*QZde_95;kI%vEULBAKQTKIW*1ohWu z=h6Mh7pb&HbF_8fb!veTh{~hzZ>-w5pxr|1Ilo?W5_$vsMQ!)%;0=*?ZWTR#wGaP4 ztv(Gp`@1)ZM)C@1fk^tp)I&dsu52Fl)cSf{(zF%SlHW)x-5a0-owgY-qa!}qa;$Fj ztfqAN53!!cS(}^!`Xm1w78kvCyB*>R;f5o8+-Ues(64x&SoPDEjin>VUphFz!E`lP zf?Dv!8I)`xE3vLt8y=(-cEri2Z~0K?I%_mmQDv&D;<>8R4}yWqM;GlK8ctgR``k&0 z^k-tu%IG(-_rTw^0ZPlm`$zL~V+H05RPSaSdVW9C0Ye}HBeu{^?vM267V(#TX~@ay z_cr^?Ddr)rz7Nw^!lfk<&L@>_Hw`KS#}uNkWj~sWh(Hu}3_4*)+0^$;U(7Z=%E+eE zB9T=hMxFG`HB9b7CK;iBVhp7;Q?V$(x`=OHwk3}MvU!rm<#0r!y z7kLET;#V@QcX=?8DJJy|ZK%s4`;dY>(UDD#8F98`hxuA~yid!fYW^y0?Y!8csm~7k zGr^{&Bck_W8QOk+LL}hbt@X!k6(ja*@iAAk8LV9N@iv96e1*I@4CwiDF&Uk;d$b?r z1gBhoiqF>*_(XFQwE7%BM1v?!Bt-gjbM@O~N{r~O9(#};qq~ICv#L$Q^f>yfDrsf-OCa^Cy+@?kTrXo??&t-R_|C~WMO5;@Sc45;#BKflYm^W<71Ss 
z23tW(Vfr8N!6!wL){8ZSmz?Ay!RZ!9*w(rcM*Rl<1~6O*`&^_IqH0V-Uvv;F29uqR zNd^ZK7%SY$VwG4uvG483g#|I~3Es`S0KA>#1z_8503!Gf z`VIJ-f_FY8JHU^IV+x0;egvs3;ud{}>T9fVRF6%Lw_saGvByKr#^{%~*MsaCvo-x$ z7W0P4;C9?`y%(zmR6Ep*S`W6RkZuTjvzS|QjF!Zo6S2Lo5v$85(@N~Rs}GjT#7gW! z7uEn6d@We?Hd-U>@7ppP^^)ni&zIyoz?&KiA{B$es}p7CU~KQZp#2s3R0sa+NV`1- z&5+&seO!~rv?X*1mC8_SRGL7+$6RdQI5w(tX5pkmuRgP#ypIw0A3P<#bLS=J}&XR7G`&7tGBGuPs@8`cTY#2Ni!mBV^{qtOR?UC zbQ|^8`Y>{$2v|+K26UggDARdRKpnkiP` zW-W+ZlffFoulF1Ah)8QAlPP;p70t4n1AX6NEJXxTyNd3eWne1s1T{k4sW9Q{Kp{2c zDRQ`-g4*YHeIfiU)+jy5T_^WZ6(aRG-w(7{ucV$(op?IdRf$u!~Bw1EfWyW*1MIv)jX~743Bsc z{rk`8QGT1)sh=l}zBJECsiFkSiC_hn4s(xHfw>fiEys*Q0{E;hk;%hr#iOv8$7Q;l zu2rGx>cwt{=e;ORSI4B8j(MNQxXLrF@&03an+WPp0-t`px4mzdoJq}NuB7U*zPuZI z0~topyF6dpfqk6k?@Dc-+r$0f2Hp|n`W~k?Js629(R%zF zQHyWWCetpu5#0)+O>UAu8S$3$!+eXbrbzRks}OQdEmT&q+S6N9S0qt_ugE)1>un51 z7FX*@1WVG(ZiRfa2(xB=X^_8^-3Xp_BA74b9ka3T3b&8a2MOi*$AS0m!MpRo2CFM} zf)j@SN1j)+8#9bN{RiHIuY*tg0lF>(KI*4+qq^v(hj`69lE7KOX8kvtnXsp|@SXxZ z?=7*|b1cUEna&m?lHZRQe}IDM7%aw)4+fH2#MgRN2SW9_lqSw<>Z(cpp!TAvt~f%= z_=#XV<-&K4N6^-Be>|xeR|H=lj{2rM=1BX}wMHsk#mo3YT)!2wp0z2P_h!A>S!!w? zpzYRV*vHTKXQKjK2oCPAu#a-N-pa)uM(&BGm8Eyq3aL7w93rh_CS<#b=0Mk=@9qme5&8%w(#4 z$;T{kbYJrHBf#zo3--fqss4X9JmMVPN_mL=I<-O{y&V0$nl-AZq4p$1HS71X?~IX@ zz|~qhgT+ypID{JhDdg2QPE?L_=2`6s<}WkoMKaJu2;uFQH4>{$|7e_`?$li}`rc@d z7!fcOWBZEOfhRWw%X7e1zdvn^lxbec7g2Af%R%1J@ZpZE;Ge&%$ETs2*B%&T8}EpH z0=LEHVwM||cEDVZ1(WG5Z5OnAKlPyg@QfeKc)Ez|7#;O`!M}H6(-Bo|?D4{&%_cd+ zct6yRQ7J++-0v0h=y}Al6xl>f&?bU`{5T@(Qtc~ERVa@bi*$`v@?$i{Y!Gvv_|Z6S zEC-vM$PW72Qx;nqIByI!zvMOFr?8`z>cxTsgx=MjDBpZkdzx-Wl!v$%g*t;nSb!K{uB-F;yRZojHPMAXXo9leC4HjEP>O_~p5JoE9JP;$m%- zUWjKaY0&GZ`e31MLfxjG)(xLFXG6fvw%|XZ{*6`V>F5ZhF-Kd9m<)`^R(jBP(e~(% z$}-H#rLp+_x1GP_Q=g{=?+O$FVW*+5R9n7P6?M4xOQ3HC=-5&g89Z6ztM<^$L^ zdS50|Esx+Ufz?*=+L%zTxw_1J-V=>R)$*W)4uo%vc-*`bK3emfGP*7pW-su)=v#aM zw&R-s&L-gga{FOzq`Y2cTTQUzb?;`xRA z3=NPI+?tp^1i41d&D6ZjiJAXs4Q9A0Q*&<1Fi(rg0dyO|$kQ2xlC}9#b@8sk`3Geu zQ;?J2_1sIjvL2@BMoPs()ZE>|)5iB|u!5Rl5;7B=z9aCw3>hD>uf4vXjOa6!Zj-6p zoM!Y1guv+NbV+XV710vsm`|b1x)FY$FLKU&zRRv4Ow!}+m;q+HFl{{qor;Tev$>A1 z;!WfUdzn@m;oaA54 z;HW5F+i3>eunPXG<_!-<*fJfSSZNnJs5eoyt4YCRAgE)=6O}YovE=YObdE9E6TLD- zExqxy(^rlhT@tg0Ww2W*EAW`3uxHJLP_m|;u4Mla$X|6&CeoAUcM+7ZL)M1%MrV>f z<^A-|kV45BU4fSkHlmY>nELepk+*Vkv_o{KI4Zh1d$Eq8?q0u&g0J(rF=Lu~@Uu~v zTs-CJDJx_-(9$sa#>u0BmYS9~FiD`JTYEDN4`h-0eHG2}D{kl`f0>y|U!j(p?VV=C zQCsaV@-*JwDUiteV(#`I%nHYIKI0 z08uBwl0xA4W3?7FPm&CK$qSCqd!z4J2if#rXGuQ7oP_d}jYa}`@oJ}@@y!gG~cv=G|gJFcExMo^DYyz5_kkg|e-f_*SEx3@f0&<=?->WYU&!u28Fl4M@v47_ zl}c^hSzx;6U~QFjuWZG)xy&(9v)m87T#s&`)0j_DUCmzXwBA(oLN)(}yam?%tmRrh z;Mp?Z)+P8Fs}5SQhxBY3?yf-Hvz%VlZ}oOU)LX>I1{lvVw&{21OReJ`qI72fc!F`> zKil1Y>4GImHBp6OAavM<=#6Ner95Oll_3XB;3YK4zFUix4Q#+xqXVaA9zQ^bG!Qq& zuhAHwj(p9)XA0+~)5qFkO!zCjJRXt!I4wZ6-T)os;q*U&=gkao&Xy3UKnguk6sXge z{5~)dGiWs|G=Lb0IS=%K=7Pn^pf!l>v$YaDEy|ebbQqk>v(`9T?mS4R+|D>97W>qV z&N5hr?U0%Vu=EzJ8TyMeh_o@r26-#u#yC2`v-zLtMw~UP_~!|NniF<6QhjRaX$saY z2!0_2)npF%hOypaOuMaso#@m8=k``3i)YhXw-tJ8IheAYEmWt?5!a(~SMOd%Q`{{g zo)H_Qe}_6`0I*CBrufG5laY$x2mk+w*YyQpvzMX=)fmwz2i>_kVEIh&ivv*WybLY= z)}Aui9pEfZ*#GX-UPJaECYS?SrS}z~gGtL|e?exxG2(eMs56-L->z6qy`pZxH5x#fLzN|1ZDN%tOnTtP@Hv0Q_IEuk26mxLS?k+78 z{Dn?C`nN;vgLT|9L@BH5$U4LMTpQrWoxdW_t#PQ4~AtcDN7*bXY@Vf z8H#go{V|1{q*b~z{m-LIseZcKcmvf(fqyNodm%UeI*a+v_|6&axmhSb_({$EoMJ<5ol{Q1{9Erd zveb9r7uU!EK(5!*Ti5~23Q>LQAyD}R6yz^mVY2i0tTuyv0jvAMV%h|uVqh}e8uv6W zqFd!NyrJF=oRK>5R&&;;##1YC&Sud5C!GF!uRsa0**}6(ZhX6WBh3tNw4xDb#OrWSD+6BZF-Sc&3 z7QY=VRwwkgpF}Nu2y+3sh??!~Coq+u&U1yNpCtj!&*htBxzOnWtr$8k_eX1v=oycw zZw<8m4CHa*3K;tL)}rRD|AP3dCi%CTs!ID@_Qh=5l{BGFP<-TXZRD^=%yWU~ohf{d 
z<*+d{VXOg)Z};grsjiO3$8rAAaXC2ul#7|A1bWl= zt@tZHOHCk&s*Y1(tpQQYR0Lq&m$VVczy%b&KMX8&$jxC(^lVH=Di-)Gn0oNzk*m~+ z?;)`3ELwy3T+E7)bBWei7d%ICrs~R*$G^AEOm;Vg!BMt_q?4^o9?<%;osxJxb`jRl z0&!`wX|kFA$>7kHMkb;oHQBfgrxF%n2DqNMM%xGZdmy{$>_bt4`S8AAA7j^i*I)z0 z1}67wBjOa!?&PYOmkQ+E9?*VM8Y)eu`J>;(^UL1g#Y`fTtN7*n(sXsFs(RFP?})e~ zh-Q0GrM?anr+OJzKA-w~`?SEQL*=MH25Y70y|?8rNH1RwUUrOMRrr@9z^^;HVLBk= znZlNz^Ci|3zUvU z#sLYf04^A)&o|Q|eYSMT;X#+zk%xJaH0-IGcbG!Ga9%!{rm2ZKN8!F<*ykRYyX*i~ zVJuIgsjkWU1NAiyuJGba0BY;#lQHi|(I@{z^|-=~t_ysCQ)oJM3-m*OwIA=S55Nh4 z5m@mS5xhd(@Ovw2#S*|Mcf;geA8=89^>Oq)WGoIUiGIDFPT&dEHJ|&^=soXnNU+v9 zY4#4Nla*@n(}P)#+2C7h(H1Z<-T55EY8@+aSOd`sh$D?XF8nk`8vw5$I%%pC|FQdR zI@oN*PKRl^K0(||&m&vBhxvxTyX|Eoqvx+Q@Q(Na?~kJ|A|1p-V6JWU6y`s+fwL+$ zE2N{1rCR9sgn-P(2dDY`KYy39mx1lMXBicha^Dqy2 zVIWgnu^Ttj`OH?)e~ci?;|>ioYIz6ppPfS5hw}kx;6c?LFZx#@4|HTRsRyf~XUsX! z)N$<3PA7~$QWkpTVVuM6$?EH$f%#s7oKVWHB3Crwr-D(;7yCLJvIDxBXfqJOo<`J; zc7WAiBvD#j&jmQF;LM*RA`g$VT>#fI2u+CDPv|P)h_x=Ou5nQnT zBs}pg)V2qWmXKTJx|xU{HRm=@1v5>BS!0*K9ve!ZW2R}b-Uqt+PL$DJE#N(iZpm-u zJ!;*96Ja6n>}tMyCpw1eZ*S0umeEPf zeyLvh$lz>Xg@NojaTR)Up(`M&cG@yguKfkwwm9qx!T!5q2B1Ih4tpp?)mbTagVT7( zn+xl|2{Syut@VE|g0u!h8nxSXvJ)a>GJPX1fCJj^uBJ~gr}NMiZ21|litbU%QdL?h zy3`L@-FQFfGLc`P)%t0ly0eJR;cr!F)RjI#E=UDF|Cn}1Fo|N@ZWb%OEm7eDbHkrg za5nD0L4nZ-_>MSXw3nUCH0>KP9`oV&ivuhuRwGxQ(i;oKI%m@)dX?&-KUYC-`VW}( z;m7K)FJuF^)ch3BlQj=8_)^quoxG=l{SXB=;*44+Pa*!6!eX2p-a#jffn2?(A%7XN ztNA>2N_G=GyAfujaM~B0wN#u79Sfwb?uh1orT5KxFCf(r-GvFVjdTn4Pp2P(Px04~ WJ3jNNYpZb8Nwdinx^I`Y`2PUJ%MKU- literal 0 HcmV?d00001 diff --git a/tests/data/t0/ag_news_prompt_text_document.idx b/tests/data/t0/ag_news_prompt_text_document.idx new file mode 100644 index 0000000000000000000000000000000000000000..dd5d3206385265e3295a64f29d7d8e446e0a46e0 GIT binary patch literal 2042 zcmYk7%S%*Y7{*^s%gU_mW@cq&Wj2{vnVFTBu^M$IwJ|mCIz>f+REUujT~^T|BrIBp zghWIu7cNBn3&KUTY$ZdBaM{vD&wHLn=!4(j)dFIQj8-bp~+3T z5xxWZf$!@8w?G%@2Jb-=7y!e-w+(^Ypc>SIUJwQ~pk6)$Uk2wuCGh}5=y22%AZ#;KS-mC(&?m}y@q*O9?oDMSWD-vqnG9H>zQMj z^iO$g19NT`Jt3EFWS)@=HZf1jg`1ft<-%;{S-ENp^DjBJl{qO7ZDY>aPLInaJD7jV z^Eu2@JL$PxI=PFU%%dxJ)5Uw}&|Z2$j^{I{?xR1+S^Jqsqhi z?xvcX>TRmEsm`Vvo9b(-uc@}Cx|-^0s;jB4rn;KyYO1TLuBMt=zOG00G}X~mM^pVw zwKLVuR6A4cOtmxB&Qv>7?M$^Z)y`BqQ|(N(Gu6&iH&fk$XRe>BY8E`7J*#HH^WU>- z7Q81st6rvBnQ9fhAAEn6e_*Ii!8^yZY7{(kJ*z&!`^EE0eXe-Gh)}`%%BKy=jdE0u s$yenjxmj+JugP(_RlY8_$?bB7d_(S(yX0=UNA8vTz>% literal 0 HcmV?d00001 diff --git a/tests/data/t0/gpt2-tiny-merges.txt b/tests/data/t0/gpt2-tiny-merges.txt new file mode 100644 index 000000000..64bc5dc65 --- /dev/null +++ b/tests/data/t0/gpt2-tiny-merges.txt @@ -0,0 +1,4744 @@ +#version: 0.2 - Trained by `huggingface/tokenizers` +Ġ t +Ġ a +h e +i n +r e +o n +Ġt he +e r +Ġ s +a t +Ġ w +Ġ o +e n +Ġ c +i t +i s +a n +o r +e s +Ġ b +e d +Ġ f +in g +Ġ p +o u +Ġa n +a l +a r +Ġt o +Ġ m +Ġo f +Ġ in +Ġ d +Ġ h +Ġan d +i c +a s +l e +Ġt h +i on +o m +l l +en t +Ġ n +Ġ l +s t +Ġ re +v e +Ġ e +r o +l y +Ġb e +Ġ g +Ġ T +c t +Ġ S +i d +o t +Ġ I +u t +e t +Ġ A +Ġ is +Ġ on +i m +a m +o w +a y +a d +s e +Ġth at +Ġ C +i g +Ġf or +a c +Ġ y +v er +u r +Ġ u +l d +Ġs t +Ġ M +' s +Ġ he +Ġ it +at ion +it h +i r +c e +Ġy ou +i l +Ġ B +Ġw h +o l +Ġ P +Ġw ith +Ġ 1 +t er +c h +Ġa s +Ġw e +Ġ ( +n d +i ll +Ġ D +i f +Ġ 2 +a g +er s +k e +Ġ " +Ġ H +e m +Ġc on +Ġ W +Ġ R +he r +Ġw as +Ġ r +o d +Ġ F +u l +at e +Ġa t +r i +p p +o re +ĠT he +Ġs e +u s +Ġp ro +Ġh a +u m +Ġa re +Ġd e +a in +an d +Ġo r +ig h +es t +is t +a b +r om +Ġ N +t h +Ġc om +Ġ G +u n +o p +0 0 +Ġ L +Ġn ot +es s +Ġe x +Ġ v +re s +Ġ E +e w +it y +an t +Ġb y +e l +o s +or t +o c +q u +Ġf rom +Ġha ve +Ġs u 
+i ve +ou ld +Ġs h +Ġth is +n t +r a +p e +igh t +ar t +m ent +Ġa l +u st +en d +- - +al l +Ġ O +ac k +Ġc h +Ġ le +i es +re d +ar d +â Ģ +ou t +Ġ J +Ġa b +e ar +i v +al ly +ou r +o st +g h +p t +Ġp l +as t +Ġc an +a k +om e +u d +T he +Ġh is +Ġd o +Ġg o +Ġh as +g e +' t +Ġ U +r ou +Ġs a +Ġ j +Ġb ut +Ġw or +Ġa ll +e ct +Ġ k +am e +Ġw ill +o k +Ġw he +Ġthe y +id e +0 1 +f f +ic h +p l +t her +Ġt r +. . +Ġin t +i e +u re +ag e +Ġn e +i al +a p +in e +ic e +Ġm e +Ġo ut +an s +on e +on g +ion s +Ġwh o +Ġ K +Ġu p +Ġthe ir +Ġa d +Ġ 3 +Ġu s +at ed +ou s +Ġm ore +u e +o g +ĠS t +in d +i ke +Ġs o +im e +p er +. " +b er +i z +a ct +Ġon e +Ġsa id +Ġ - +a re +Ġyou r +c c +ĠT h +Ġc l +e p +a ke +ab le +i p +Ġcon t +Ġwh ich +i a +Ġ im +Ġab out +Ġwe re +ver y +u b +Ġh ad +Ġ en +Ġcom p +, " +ĠI n +Ġu n +Ġa g +i re +ac e +a u +ar y +Ġw ould +as s +r y +Ġ âĢ +c l +o ok +e re +s o +Ġ V +ig n +i b +Ġof f +Ġt e +v en +Ġ Y +i le +o se +it e +or m +Ġ2 01 +Ġre s +Ġm an +Ġp er +Ġo ther +or d +ul t +Ġbe en +Ġl ike +as e +an ce +k s +ay s +ow n +en ce +Ġd is +ct ion +Ġan y +Ġa pp +Ġs p +in t +res s +ation s +a il +Ġ 4 +ic al +Ġthe m +Ġhe r +ou nt +ĠC h +Ġa r +Ġ if +Ġthe re +Ġp e +Ġy ear +a v +Ġm y +Ġs ome +Ġwhe n +ou gh +ac h +Ġth an +r u +on d +ic k +Ġo ver +ve l +Ġ qu +Ċ Ċ +Ġs c +re at +re e +ĠI t +ou nd +p ort +Ġal so +Ġp art +f ter +Ġk n +Ġbe c +Ġt ime +en s +Ġ 5 +op le +Ġwh at +Ġn o +d u +m er +an g +Ġn ew +-- -- +Ġg et +or y +it ion +ing s +Ġj ust +Ġint o +Ġ 0 +ent s +o ve +t e +Ġpe ople +Ġp re +Ġit s +Ġre c +Ġt w +i an +ir st +ar k +or s +Ġwor k +ad e +o b +Ġs he +Ġo ur +w n +in k +l ic +Ġ1 9 +ĠH e +is h +nd er +au se +Ġh im +on s +Ġ [ +Ġ ro +f orm +i ld +at es +ver s +Ġon ly +o ll +Ġs pe +c k +e ll +am p +Ġa cc +Ġb l +i ous +ur n +f t +o od +Ġh ow +he d +Ġ ' +Ġa fter +a w +Ġat t +o v +n e +Ġpl ay +er v +ic t +Ġc ould +it t +Ġa m +Ġf irst +Ġ 6 +Ġa ct +Ġ $ +e c +h ing +u al +u ll +Ġcom m +o y +o ld +c es +at er +Ġf e +Ġbe t +w e +if f +Ġtw o +oc k +Ġb ack +) . +id ent +Ġu nder +rou gh +se l +x t +Ġm ay +rou nd +Ġp o +p h +is s +Ġd es +Ġm ost +Ġd id +Ġad d +j ect +Ġin c +f ore +Ġp ol +on t +Ġag ain +cl ud +ter n +Ġkn ow +Ġne ed +Ġcon s +Ġc o +Ġ . 
+Ġw ant +Ġse e +Ġ 7 +n ing +i ew +ĠTh is +c ed +Ġe ven +Ġin d +t y +ĠW e +at h +Ġthe se +Ġp r +Ġu se +Ġbec ause +Ġf l +n g +Ġn ow +ĠâĢ ĵ +c om +is e +Ġm ake +Ġthe n +ow er +Ġe very +ĠU n +Ġse c +os s +u ch +Ġe m +Ġ = +ĠR e +i ed +r it +Ġin v +le ct +Ġsu pp +at ing +Ġl ook +m an +pe ct +Ġ 8 +ro w +Ġb u +Ġwhe re +if ic +Ġyear s +i ly +Ġd iff +Ġsh ould +Ġre m +T h +I n +Ġe v +d ay +' re +ri b +Ġre l +s s +Ġde f +Ġr ight +Ġs y +) , +l es +00 0 +he n +Ġth rough +ĠT r +_ _ +Ġw ay +Ġd on +Ġ , +Ġ1 0 +as ed +Ġas s +ub lic +Ġre g +ĠA nd +i x +Ġ very +Ġin clud +ot her +Ġim p +ot h +Ġsu b +ĠâĢ Ķ +Ġbe ing +ar g +ĠW h += = +ib le +Ġdo es +an ge +r am +Ġ 9 +er t +p s +it ed +ation al +Ġb r +Ġd own +Ġman y +ak ing +Ġc all +ur ing +it ies +Ġp h +ic s +al s +Ġde c +at ive +en er +Ġbe fore +il ity +Ġwe ll +Ġm uch +ers on +Ġth ose +Ġsu ch +Ġ ke +Ġ end +ĠB ut +as on +t ing +Ġl ong +e f +Ġth ink +y s +Ġbe l +Ġs m +it s +a x +Ġo wn +Ġpro v +Ġs et +if e +ment s +b le +w ard +Ġsh ow +Ġp res +m s +om et +Ġo b +Ġs ay +ĠS h +t s +f ul +Ġe ff +Ġg u +Ġin st +u nd +re n +c ess +Ġ ent +ĠY ou +Ġgo od +Ġst art +in ce +Ġm ade +t t +st em +ol og +u p +Ġ | +um p +Ġhe l +ver n +ul ar +u ally +Ġa c +Ġm on +Ġl ast +Ġ2 00 +1 0 +Ġst ud +u res +ĠA r +sel f +ar s +mer ic +u es +c y +Ġm in +oll ow +Ġc ol +i o +Ġm od +Ġc ount +ĠC om +he s +Ġf in +a ir +i er +âĢ Ķ +re ad +an k +at ch +e ver +Ġst r +Ġpo int +or k +ĠN ew +Ġs ur +o ol +al k +em ent +Ġus ed +ra ct +we en +Ġs ame +ou n +ĠA l +c i +Ġdiff ere +Ġwh ile +---- ---- +Ġg ame +ce pt +Ġs im +.. . +Ġin ter +e k +Ġre port +Ġpro du +Ġst ill +l ed +a h +Ġhe re +Ġwor ld +Ġth ough +Ġn um +ar ch +im es +al e +ĠS e +ĠI f +/ / +ĠL e +Ġre t +Ġre f +Ġtr ans +n er +ut ion +ter s +Ġt ake +ĠC l +Ġcon f +w ay +a ve +Ġgo ing +Ġs l +u g +ĠA meric +Ġspe c +Ġh and +Ġbet ween +ist s +ĠD e +o ot +I t +Ġe ar +Ġagain st +Ġh igh +g an +a z +at her +Ġex p +Ġo p +Ġin s +Ġg r +Ġhel p +Ġre qu +et s +in s +ĠP ro +is m +Ġf ound +l and +at a +us s +am es +Ġp erson +Ġg reat +p r +Ġs ign +ĠA n +' ve +Ġs omet +Ġs er +h ip +Ġr un +Ġ : +Ġt er +ire ct +Ġf ollow +Ġd et +ic es +Ġf ind +1 2 +Ġm em +Ġc r +e red +e x +Ġex t +ut h +en se +c o +Ġte am +v ing +ou se +as h +at t +v ed +Ġsy stem +ĠA s +d er +iv es +m in +Ġle ad +ĠB l +c ent +Ġa round +Ġgo vern +Ġc ur +vel op +an y +Ġc our +al th +ag es +iz e +Ġc ar +od e +Ġl aw +Ġre ad +' m +c on +Ġre al +Ġsupp ort +Ġ1 2 +.. .. 
+Ġre ally +n ess +Ġf act +Ġd ay +Ġb oth +y ing +Ġs erv +ĠF or +Ġth ree +Ġw om +Ġm ed +od y +ĠThe y +5 0 +Ġex per +t on +Ġe ach +ak es +Ġc he +Ġc re +in es +Ġre p +1 9 +g g +ill ion +Ġg rou +ut e +i k +W e +g et +E R +Ġm et +Ġs ays +o x +Ġd uring +er n +iz ed +a red +Ġf am +ic ally +Ġha pp +ĠI s +Ġch ar +m ed +v ent +Ġg ener +i ent +p le +i et +re nt +1 1 +v es +pt ion +Ġ2 0 +form ation +Ġc or +Ġoff ic +ie ld +Ġto o +is ion +Ġin f +Ġ Z +t he +o ad +Ġp ublic +Ġpro g +r ic +* * +Ġw ar +Ġp ower +v iew +Ġf ew +Ġl oc +Ġdiffere nt +Ġst ate +Ġhe ad +' ll +Ġp oss +Ġst at +re t +ant s +Ġv al +Ġis s +Ġc le +i vers +an c +Ġex pl +Ġan other +Ġ Q +Ġa v +th ing +n ce +W h +Ġch ild +Ġs ince +i red +l ess +Ġl ife +Ġde velop +itt le +Ġde p +Ġp ass +ã ĥ +Ġt urn +or n +Th is +b ers +ro ss +ĠA d +Ġf r +Ġres p +Ġsec ond +o h +Ġ / +Ġdis c +Ġ & +Ġsomet hing +Ġcomp le +Ġ ed +Ġf il +Ġmon th +a j +u c +Ġgovern ment +Ġwith out +Ġle g +Ġd ist +Ġp ut +Ġqu est +an n +Ġpro t +2 0 +Ġne ver +i ence +Ġle vel +Ġar t +Ġth ings +Ġm ight +Ġeff ect +Ġcont ro +Ġc ent +Ġ1 8 +Ġall ow +Ġbel ie +ch ool +ot t +Ġinc re +Ġfe el +Ġres ult +Ġl ot +Ġf un +ot e +Ġt y +ere st +Ġcont in +Ġus ing +Ġb ig +2 01 +Ġas k +Ġb est +Ġ ) +I N +Ġo pp +3 0 +Ġnum ber +in ess +S t +le ase +Ġc a +Ġm ust +Ġd irect +Ġg l +Ġ < +Ġop en +Ġp ost +Ġcom e +Ġse em +ord ing +Ġwe ek +ate ly +it al +Ġe l +ri end +Ġf ar +Ġt ra +in al +Ġp ri +ĠU S +Ġpl ace +Ġfor m +Ġto ld +" : +ain s +at ure +ĠTr ump +Ġst and +Ġ # +id er +ĠF r +Ġne xt +Ġs oc +Ġp ur +Ġle t +Ġl ittle +Ġh um +Ġ i +r on +1 5 +Ġ1 5 +Ġcomm un +Ġm ark +ĠThe re +Ġw r +ĠTh at +Ġin formation +w ays +Ġb us +a pp +Ġinv est +m e +Ġh ard +ain ed +e ad +Ġim port +Ġapp ro +Ġt est +Ġt ri +Ġre st +os ed +Ġf ull +Ġc are +ĠS p +Ġc ase +O N +Ġs k +Ġl ess +Ġ + +Ġpart ic +ĠP l +ab ly +u ck +is hed +ch n +b e +Ġl ist +at or +Ġto p +Ġad v +ĠB e +ru ct +Ġd em +r ation +l ing +g y +re en +g er +Ġh ome +Ġle ft +Ġbet ter +Ġd ata +Ġ1 1 +Ġatt ack +Ġpro ble +l ine +ard s +Ġbe h +r al +ĠH ow +ĠS he +ar ge +Ġ -- +: // +Ġb ro +ĠP h +at s +Ġbu ild +w w +id ed +a im +as es +en cy +Ġm ain +in ed +Ġinclud ing +Ġ { +Ġg ot +Ġint erest +Ġke ep +Ġ X +Ġe as +ain ing +Ġcl ass +âĢ ¦ +ĠN o +Ġv ar +Ġsm all +amp le +A T +Ġ ide +ĠS o +Ġre ce +Ġpol it +Ġm ov +Ġpl an +Ġper cent +iv ing +Ġc amp +Ġp ay +1 4 +s c +is ed +Ġu nt +one y +pl oy +== == +Ġdid n +ĠI nd +el s +ert ain +Ġp os +__ __ +i ver +Ġpro cess +Ġprog ram +if ied +ĠR ep +1 6 +u ro +olog y +at ter +in a +Ġn ame +ĠA ll +Ġf our +Ġret urn +v ious +b s +Ġcall ed +Ġm ove +ĠS c +ir d +Ġgrou p +Ġb re +Ġm en +Ġc ap +t en +e e +Ġd ri +le g +he re +uth or +Ġp at +Ġcur rent +id es +Ġp op +t o +ent ion +Ġal ways +Ġm il +Ġwom en +Ġ1 6 +Ġo ld +iv en +ra ph +ĠO r +r or +ent ly +Ġn ear +ĠE x +re am +s h +Ġ1 4 +Ġf ree +iss ion +st and +ĠC on +al ity +us ed +1 3 +Ġdes ign +Ġch ange +Ġch ang +Ġb o +Ġv is +em ber +Ġb ook +read y +Ġk ill +2 5 +pp ed +Ġa way +Ġab le +Ġcount ry +Ġcon st +ar n +Ġor der +A R +i or +i um +or th +1 8 +ail able +Ġs w +Ġm illion +Ġ1 3 +at ic +t ed +ĠG o +Ġo per +en g +Ġth ing +aj or +con om +ĠCom m +Ġwh y +u red +ur al +Ġs chool +b y +ĠM ar +Ġa ff +Ġd ays +Ġan n +us h +an e +I f +e g +Ġpro f +Ġhe alth +ou th +B ut +ion al +. 
, +Ġs ol +Ġal ready +Ġ3 0 +Ġchar act +H e +Ġf riend +E S +i ans +ic le +' d +ĠO n +Ġle ast +Ġp rom +Ġd r +Ġh ist +it her +Ġ est +i qu +1 7 +s on +Ġte ll +Ġt alk +oh n +o int +le ction +A N +Ġunt il +au gh +Ġl ater +Ġ ve +Ġv iew +end ing +iv ed +Ġwor d +w are +Ġc ost +Ġen ough +Ġg ive +ĠUn ited +Ġte chn +are nt +O R +Ġp ar +ĠD r +Ġ201 6 +r ist +er ing +Ġ  +Ġl arge +s ide +ac y +cc ess +Ġw in +Ġimport ant +Ġ19 9 +Ġdoes n +Ġ1 7 +Ġbus iness +Ġcle ar +Ġre se +" , +ur y +Ġe qu +as ter +al f +ĠAmeric an +n ect +Ġex pect +ivers ity +Ġo cc +ĠF l +Ġk ind +Ġme an +Ġp ast +Ġde v +Ġb as +le t +ra ft +Ġor gan +Ġde l +Ġper form +Ġst ory +Ġse ason +ĠC ol +Ġcl aim +Ġc ame +Ġwith in +Ġl ine +Ġpro ject +ĠA t +Ġcontro l +end ed +ĠS y +Ġa ir +iz ation +Ġ * +le y +Ġm oney +id d +Y ou +f or +Ġfam ily +Ġm aking +Ġb it +Ġpol ice +Ġhapp en +Ġ vers +on y +u ff +ĠW hen +Ġs it +ide o +l f +is on +Ġsu re +g in +Ġapp ear +Ġl ight +Ġ es +o f +Ġw ater +Ġt imes +n ot +Ġg row +Ġcomp any +ĠT e +ow s +Ġm ar +our ce +i ol +ar m +b r +Ġex ample +Ġcon c +Ġf ore +ĠT o +p ro +E N +ri es +Ġ2 5 +ĠC an +ne y +Ġact ually +Ġe ver +ur ity +ak en +ap s +Ġt ax +Ġm ajor +am a +Ġof ten +er al +Ġhum an +Ġj ob +is ter +Ġav ailable +oc r +en n +a id +iv id +Ġrec ord +? " +Ġs ing +ĠA m +id ence +Ġnew s +st er +Ġe conom +Ġfollow ing +ĠB r +is ing +Ġh our +m ost +um ent +Ġse x +Ġdes c +Ġbec ome +ĠE d +Ġto ok +Ġha ving +Ġprodu ct +a ult +A s +ar ing +Ġme ans +Ġh op +un e +Ġch o +Ġc ertain +Ġn on +Ġde al +2 4 +le ment +oc i +en e +Ġs ide +ĠP r +ĠM ay +Ġre ason +u ed +c hed +ul ation +Ġe lect +Ġoffic ial +Ġposs ible +Ġh old +and s +ot s +Ġc ity +or ies +Ġse ver +Ġchild ren +Ġon ce +Ġact iv +l er +Ġn ight +it ions +ĠJ ohn +a pe +pl ay +Ġd one +Ġl im +Ġwork ing +ĠP res +or ld +e b +ĠC o +Ġb ody +ail s +ut es +ĠM r +Ġwhe ther +Ġa uthor +ro p +Ġpro per +Ġse en +) ; +Ġf ac +ĠS u +Ġcon d +it ing +Ġcour se +Ġ } +-------- -------- +a ign +Ġev ent +Ġen g +Ġp ot +Ġin tern +i am +Ġsh ort +em pt +ã Ĥ +ĠG od +il ar +8 0 +Ġor ig +I S +our n +ab ility +it ive +Ġd am +Ġ1 00 +Ġp ress +Ġdo ing +Ġprot ect +r ing +Ġthough t +Ġquest ion +re w +ĠW ar +Ġsever al +ĠSt ate +Ġg iven +Ġf und +ĠT w +Ġw ent +an ces +w ork +p or +m y +4 0 +Ġar g +art ment +ust om +Ġpol ic +Ġme et +Ġc reat +2 2 +ĠSt ates +Ġg ames +ra w +ut ure +Ġunder stand +ur s +ĠO b +l ish +s y +Ġm akes +Ġw on +ag on +Ġh tt +Ġl ove +ent ial +Ġcomple te +p ar +ĠI m +A L +Ġacc ount + ł +ore d +ver t +Ġ ident +Ġ201 5 +Ġother s +ĠM in +i ber +ver age +The re +ition al +d d +Ġpro b +Ġyou ng +Ġal ong +Ġacc ording +Ġy et +Ġmem bers +ĠWh at +o id +ĠM an +A nd +Ġam ong +a i +Ġem ploy +ĠR es +Ġ > +Ġinv ol +Ġl ow +a f +ĠC ar +Ġh ig +ĠO ne +ĠS ec +in ation +Ġlike ly +Ġan t +ag ed +ĠR uss +Ġb en +Ġre le +F or +b ack +ĠN ot +Ġpres ident +b all +Ġacc ess +ivid ual +ĠD em +ĠE uro +6 0 +Ġkn own +ir l +ĠG r +Ġear ly +u se +iet y +âĢ ĵ +Ġf ight +Ġs ent +Ġto day +Ġmark et +" . 
+Ġb ased +Ġstr ong +ur ther +Ġde b +m ber +Ġproble m +Ġde ath +Ġsoc ial +im ate +A S +ort un +Ġcamp aign +er y +C h +Ġe y +i ally +Ġm us +w h +p os +Ġ er +Ġsa f +Ġmonth s +ir on +Ġv iol +Ġf ive +Ġst re +Ġplay ers +in c +al d +y ear +a un +Ġsu ccess +Ġpres ent +ere nce +Ġ201 4 +Ġsu gg +Ġpartic ular +Ġtr y +Ġsugg est +ĠCh rist +on es +Ġpri v +2 3 +Ġc rit +Ġl and +Ġloc al +if y +2 9 +Ġa ut +E D +ĠG u +Ġm ult +Ġpolit ical +Ġask ed +Ġfor mer +it ter +ri pt +Ġcl ose +Ġp ract +ĠY ork +Ġget ting +Ġac ross +Ġcom b +Ġbelie ve +Ġ z +Ġto get +Ġtoget her +ĠC ent +ir c +Ġind ividual +ĠM c +2 7 +is k +ĠE ng +Ġf ace +Ġ2 4 +Ġval ue +Ġare a +e v +Ġw rit +ĠPres ident +Ġv ot +Ġke y +Ġm om +p ut +Ġany thing +Ġexper ience +att le +Ġm ind +a ff +om m +Ġf uture +g ed +Ġc ut +Ġto t +it ch +Ġv ideo +Ġinvest ig +Ġn et +ĠM y +r ict +i en +. ) +Ġimp ro +th ough +ward s +Ġcon nect +ĠM ed +sel ves +ens ive +m b +o ber +at ors +A n +Ġ5 0 +Ġre du +res ent +Ġab ove +Ġf re +ĠEuro pe +s w +Ġam ount +ĠA pp +Ġe ither +Ġmil it +Ġan al +Ġf ail +ĠE n +al es +Ġspec ial +Ġbl ack +I T +c her +Ġlook ing +Ġf ire +y n +Ġal most +o on +Ġstud y +Ġm iss +c hes +ro wn +Ġt re +Ġcommun ity +Ġmed ia +Ġf ood +Ġcom es +ĠUn iversity +Ġsing le +Wh at +u ly +Ġh alf +ag ue +h od +ĠRep ublic +Ġstart ed +Ġqu ick +ot o +b ook +Ġiss ue +it or +Ġel se +Ġcons ider +2 6 +ro du +Ġt aken +2 8 +9 9 +ĠW ith +Ġtr ue +Ġw a +Ġtr ad +Ġag o +Ġm ess +ie f +Ġadd ed +o ke +Ġb ad +Ġf av +3 3 +Ġsim ilar +as k +ĠD on +Ġcharact er +ort s +ĠH ouse +Ġreport ed +Ġty pe +v al +i od +ĠHow ever +Ġt arg +Ġent ire +pp ing +Ġhist ory +Ġl ive +ff ic +.... .... +ed eral +Ġtr ying +Ġdisc uss +ĠH ar +ac es +l ished +Ġse lf +os p +re st +Ġro om +el t +Ġf all +ol ution +Ġe t +Ġ x +Ġis n +Ġide a +b o +Ġs ound +ĠD ep +Ġsome one +ci ally +ull y +Ġf oc +Ġob ject +if t +ap er +Ġplay er +Ġr ather +Ġserv ice +as hing +ĠD o +ĠP art +ru g +m on +p ly +Ġm or +Ġnot hing +Ġprov ide +I C +un g +Ġpart y +Ġex ist +Ġm ag +7 0 +Ġr ul +Ġh ouse +Ġbeh ind +Ġhow ever +ĠW orld +Ġs um +Ġapp lic +Ġ ; +Ġfun ction +g r +ĠP ol +Ġfr ont +2 00 +Ġser ies +Ġt em +Ġty p +ill s +Ġo pt +Ġpoint s +Ġbel ow +itt ed +Ġspec ific +Ġ201 7 +um b +Ġr a +Ġpre vious +Ġpre t +re me +Ġc ustom +Ġcour t +ĠM e +Ġre pl +Ġwho le +g o +c er +Ġt reat +ĠA ct +Ġprob ably +Ġle arn +end er +ĠA ss +Ġvers ion +n ow +Ġche ck +ĠC al +R E +min ist +O n +our ces +Ġben ef +Ġd oc +Ġdet er +Ġen c +Ġsu per +Ġadd ress +Ġv ict +Ġ201 3 +Ġme as +t r +Ġf ield +W hen +Ġsign ific +u ge +Ġfe at +Ġcomm on +l oad +Ġbe gin +Ġbr ing +Ġa ction +er man +Ġdesc rib +Ġind ust +Ġwant ed +ri ed +m ing +Ġatt empt +4 5 +f er +Ġd ue +ress ion +# # +Ġsh all +Ġs ix +o o +Ġst ep +Ġp ub +Ġhim self +Ġ2 3 +Ġc op +Ġd est +Ġst op +A C +ib ility +Ġl ab +ic ult +Ġhour s +Ġcre ate +Ġf urther +ĠAmeric a +ĠC ity +Ġd ou +he ad +S T +ĠN orth +c ing +Ġn ational +u le +ĠIn st +Ġt aking +ĠQ u +ir t +Ġre d +Ġrese arch +v iron +ĠG e +Ġbre ak +an a +Ġsp ace +ater ial +Ġrec ent +ĠA b +Ġgener al +Ġh it +Ġper iod +Ġevery thing +ive ly +Ġph ys +Ġsay ing +an ks +Ġc ou +Ġc ult +ac ed +e al +u ation +Ġc oun +l u +Ġinclud e +Ġpos ition +ĠA fter +ĠCan ad +ĠE m +Ġim m +ĠR ed +Ġp ick +Ġcom pl +Ġm atter +re g +e xt +ang u +is c +o le +a ut +Ġcomp et +e ed +f ect +Ġ2 1 +ĠS en +ĠThe se +as ing +Ġcan not +Ġin it +Ġrel ations +ac hed +Ġb ar +Ġ4 0 +ĠT H +Ġ201 2 +Ġv ol +Ġg round +Ġsec urity +Ġup d +il t +3 5 +Ġconc ern +ĠJ ust +Ġwh ite +Ġseem s +ĠH er +pe cially +i ents +Ġann oun +Ġf ig +ight s +Ġst ri +l ike +id s +Ġs us +Ġw atch +Ġ â +Ġw ind +ĠC ont +Ġit self +Ġm ass +A l +y le +iqu e +ĠN ational +Ġab s +Ġp 
ack +Ġout side +Ġan im +Ġp ain +et er +Ġman ag +du ct +og n +Ġ ] +ĠSe pt +se c +o ff +ĠJ an +Ġf oot +ad es +Ġth ird +Ġm ot +Ġev idence +int on +Ġth reat +a pt +pl es +c le +Ġl o +Ġde cl +Ġit em +med i +Ġrep resent +om b +am er +Ġsignific ant +og raph +s u +Ġc al +i res +00 00 +I D +A M +Ġsim ply +Ġlong er +Ġf ile +O T +c he +S o +ate g +or g +ĠH is +Ġen er +Ġd om +Ġup on +il i +": " +Ġthem selves +Ġcom ing +Ġqu ite +Ġdiff icult +ĠB ar +il ities +re l +end s +c ial +6 4 +Ġwom an +ra p +y r +Ġne cess +ip s +Ġte xt +Ġrequ ire +Ġmilit ary +Ġre view +Ġresp ons +7 5 +Ġsub ject +Ġinst ead +Ġiss ues +Ġg en +" ," +Ġmin utes +Ġwe ap +r ay +am ed +t ime +b l +H ow +Ġc ode +ĠS m +Ġhig her +ĠSt e +r is +Ġp age +Ġstud ents +ĠIn tern +Ġmet hod +ĠA ug +ĠP er +ĠA g +Ġpolic y +ĠS w +Ġex ec +Ġac cept +um e +rib ut +Ġword s +Ġfin al +Ġchang es +ĠDem ocr +Ġfriend s +Ġres pect +Ġe p +Ġcomp an +iv il +Ġdam age +** ** +og le +viron ment +Ġne g +ent al +Ġa p +Ġtot al +iv al +! " +l im +Ġneed s +Ġag re +Ġdevelop ment +Ġa ge +ip le +2 1 +Ġresult s +ĠA f +S h +Ġg un +ĠOb ama +ro ll +Ġ @ +Ġright s +ĠB rit +Ġrun ning +Ġwas n +Ġp ort +Ġr ate +Ġpret ty +Ġtarg et +Ġsa w +Ġc irc +Ġwor ks +ic ro +al t +o ver +ww w +Th at +l ier +Ġevery one +ud e +Ġp ie +idd le +ra el +Ġr ad +Ġbl ock +Ġw alk +T o +ã ģ +n es +ĠA ust +a ul +ro te +ĠS outh +ess ion +op h +Ġshow s +Ġs ite +Ġj o +Ġr isk +cl us +l t +Ġin j +id ing +ĠS pe +Ġch all +ir m +Ġ2 2 +itt ing +st r +Ġh y +L E +ke y +Ġbe gan +at ur +ashing ton +l am +ĠD av +b it +Ġs ize +ĠP ar +3 8 +ourn al +f ace +Ġdec ision +Ġl arg +Ġj ud +re ct +Ġcontin ue +ĠO ct +ove red +ĠI nt +==== ==== +Ġp arent +ĠW ill +Ġeas y +Ġd rug +ang er +Ġs ense +Ġd i +id ay +Ġener gy +ist ic +Ġass oci +ar ter +ob al +e ks +ĠE l +ur ch +Ġg irl +o e +it le +Ġ2 8 +ĠC he +Ġrequ est +Ġso on +Ġh ost +k y +Ġst ates +om es +Ġm aterial +le x +Ġmom ent +Ġan sw +on se +Ġes pecially +Ġn orm +Ġserv ices +p ite +r an +Ġro le +4 4 +) : +Ġc red +C l +____ ____ +Ġm at +Ġl og +ĠCl inton +O U +Ġoff ice +Ġ2 6 +Ġch arg +Ġtr ack +m a +Ġhe art +Ġb all +Ġperson al +Ġbuild ing +n a +s et +b ody +ĠBl ack +Ġincre ase +itt en +Ġneed ed +3 6 +3 2 += " +Ġl ost +Ġbec ame +Ġgrou ps +ĠM us +Ġw rote +ĠP e +Ġpro p +j oy +à © +ĠWh ite +Ġde ad +. ' +Ġhtt p +Ġwe bs +O S +Ġins ide +Ġwr ong +Ġstat ement +Ġ ... 
+y l +Ġfil m +Ġmus ic +Ġsh are +ific ation +Ġre lease +Ġfor ward +Ġst ay +Ġcomp ut +it te +s er +Ġorig inal +Ġc ard +Ġc and +Ġd iv +at ural +Ġfav or +O M +Ġc ases +us es +Ġse ction +Ġle ave +g ing +ov ed +ĠW ashington +3 9 +ĠG l +Ġrequ ired +act ion +ap an +o or +it er +ĠK ing +Ġcount ries +ĠG erman +ll ing +Ġ2 7 +3 4 +Ġquest ions +Ġpr im +Ġc ell +Ġsh oot +Ġany one +ĠW est +Ġaff ect +ep end +Ġon line +ĠIs rael +ĠSept ember +Ġab ility +Ġcont ent +is es +Ġre ve +Ġl aun +Ġind ic +Ġfor ce +c ast +Ġso ld +av ing +f l +Ġso ft +Ġcompan ies +ce ed +Ġart icle +Ġa ud +Ġre v +Ġed uc +Ġplay ing +0 5 +Ġhe ld +ct or +Ġrele ased +Ġf ederal +3 7 +Ġad minist +Ġinter view +Ġinst all +Ġrece ived +Ġs ource +u k +P h +Ġser ious +Ġcre ated +Ġc ause +Ġim medi +Ġdef in +u el +ĠDep artment +ct ions +ĠC our +ĠN ow +z e +it es +it ution +Ġl ate +Ġspe ak +n ers +Ġleg al +ar i +ĠC or +Ġwe eks +Ġmod el +Ġp red +Ġex act +B C +ĠB y +IN G +os ing +Ġt akes +Ġreg ard +Ġopp ortun +Ġpr ice +Ġ19 8 +ĠA pr +f ully +Ġor d +Ġproble ms +ru ction +h am +ĠC ount +le ge +Ġlead ers +E T +le v +Ġde ep +olog ical +es e +h aps +ĠS ome +Ġp ers +Ġcont ract +Ġrelations hip +s p +ou d +Ġb ase +4 8 +m it +A d +anc ial +Ġcons um +Ġpot ential +Ġl angu +re m +et h +Ġrel ig +ress ed +6 6 +Ġl ink +Ġl ower +ay er +ĠJ une +Ġf em +un t +er c +ur d +Ġcont act +Ġ ill +Ġm other +Ġest ab +h tt +ĠM arch +ĠB ro +ĠCh ina +Ġ2 9 +Ġs qu +Ġprov ided +Ġa verage +as ons +Ġ201 1 +Ġex am +l in +5 5 +n ed +Ġper fect +Ġt ou +al se +u x +Ġbu y +Ġsh ot +Ġcol lect +Ġph ot +Ġplay ed +Ġsur pr +Ġofficial s +Ġsim ple +av y +Ġindust ry +Ġhand s +g round +Ġp ull +Ġr ound +Ġus er +Ġr ange +u ary +Ġpriv ate +op s +e es +Ġw ays +ĠM ich +Ġve h +Ġex cept +Ġter ms +im um +pp er +I ON +ore s +ĠDr agon +ou l +Ġd en +Ġperform ance +Ġb ill +c il +4 7 +Ġen vironment +Ġex c +ad d +Ġwor th +Ġp ict +Ġch ance +Ġ201 8 +b or +Ġspe ed +ict ion +Ġal leg +ĠJ apan +at ory +re et +Ġm atch +ĠI I +Ġst ru +ord er +Ġst e +Ġl iving +Ġst ruct +in o +Ġse par +her n +Ġresp onse +Ġen joy +Ġv ia +A D +um ents +ace book +Ġmem ber +ib r +iz ing +Ġto ol +ĠM on +ĠWh ile +h ood +ĠA ng +ĠD ef +Ġoff er +T r +a ur +Ġturn ed +ĠJ uly +d own +an ced +Ġrec ently +ĠE ar +Ġc e +ĠSt ar +ĠC ong +rough t +Ġbl ood +Ġhop e +Ġcom ment +ain t +Ġar ri +il es +Ġpartic ip +ough t +ri ption +0 8 +4 9 +Ġg ave +Ġse lect +Ġkill ed +sy ch +Ġgo es +i j +Ġc oll +Ġimp act +at ives +ĠS er +0 9 +ĠAug ust +Ġb oy +d e +ĠD es +Ġf elt +U S +Ġexpect ed +Ġim age +ĠM ark +cc ording +o ice +E C +ĠM ag +en ed +h old +ĠP ost +Ġpre vent +N o +Ġinvol ved +Ġey es +Ġquick ly +A t +un k +Ġbeh av +Ġ ur +Ġl ed +c ome +e y +Ġcand id +Ġear lier +Ġfoc us +et y +P ro +led ge +ix ed +ill ed +Ġpop ular +A P +Ġset t +l ight +Ġvar ious +in ks +Ġlevel s +Ġro ad +ell ig +ab les +he l +itte e +ĠG ener +y pe +Ġhe ard +ic les +Ġm is +Ġus ers +ĠS an +Ġimpro ve +Ġf ather +Ġse arch +The y +v il +Ġprof ess +Ġkn ew +Ġl oss +Ġev ents +6 5 +Ġb illion +0 7 +0 2 +ĠNew s +ĠA M +Ġco ver +w here +ens ion +Ġb ott +Ġare as +en ces +op e +ĠTw itter +a el +Ġget s +ĠGo ogle +Ġs n +i ant +Ġv ote +Ġnear ly +Ġinclud ed +Ġrec ogn +z z +m m +al ed +Ġhappen ed +0 4 +Ġh ot +Ġwho se +Ġc ivil +Ġsu ff +o es +it iz +ĠSy ri +Ġresp ond +Ġh on +Ġfeat ures +Ġeconom ic +ĠApr il +r im +Ġtechn ology +Ġo ption +ag ing +Ġpur ch +R e +Ġl at +ch ie +is l +Ġrec omm +u f +Ġtr aining +Ġeffect s +Ġf ast +Ġ201 0 +Ġocc ur +Ġwebs ite +Ġem ail +Ġs ens +e ch +Ġo il +Ġinf lu +Ġcurrent ly +ĠS ch +ĠAd d +Ġgo al +Ġsc ient +Ġcon v +1 00 +em y +Ġdec ided +Ġtra vel +Ġm ention +L L +0 3 +Ġe lection +Ġph one +Ġlook s +Ġsit 
uation +Ġc y +Ġh or +b ed +ĠCour t +a ily +av es +Ġqu ality +ĠCom p +w ise +Ġt able +Ġst aff +ĠW ind +et t +Ġtri ed +ide red +Ġadd ition +Ġb ox +Ġl ack +ar ily +Ġw ide +Ġm id +Ġbo ard +ys is +Ġant i +h a +Ġd ig +en ing +Ġd ro +C on +6 8 +Ġsl ow +b ased +se qu +Ġp ath +E x +ak er +Ġwork ed +Ġp en +Ġeng ine +Ġlook ed +ĠSu per +ĠS erv +Ġvict im +U n +Ġproper ty +Ġint rodu +Ġexec ut +ĠP M +L e +Ġcol or +ĠM ore +Ġ6 0 +Ġnet work +Ġd ate +c ul +id ge +Ġext ra +3 1 +Ġs le +6 7 +Ġw ond +Ġreport s +j ust +ĠAust ral +Ġcap ital +Ġen s +Ġcomm and +Ġallow ed +Ġpre p +Ġca pt +h ib +Ġnum bers +ch an +Ġf air +m p +om s +Ġre ach +W ith +t ain +Ġbro ad +Ġcou ple +ec ause +ly ing +ĠF eb +Ġsc reen +Ġl ives +Ġpri or +ĠCong ress +A r +Ġappro ach +Ġe mer +ar ies +ĠD is +s erv +ĠN e +Ġbu ilt +c ies +Ġre pe +Ġrul es +for ce +ĠP al +Ġfin ancial +Ġcons idered +ĠCh ar +n ces +ĠI S +Ġb rought +Ġb i +i ers +ĠS im +O P +Ġproduct s +Ġvis it +Ġdoc ument +Ġcon duct +Ġcomplete ly +in ing +ĠCal if +ib ly +Ġwr itten +ĠT V +em ents +Ġd raw +O ne +Ġpub lished +Ġsec ret +r ain +he t +ĠF acebook +ond ay +ĠU p +Ġsex ual +Ġth ous +ĠP at +Ġ ess +Ġstand ard +Ġar m +g es +ect ion +Ġf ell +Ġfore ign +an i +ĠFr iday +Ġreg ular +in ary +Ġincre ased +Ġus ually +Ġdem on +Ġd ark +Ġadd itional +ro l +ĠO f +Ġprodu ction +! ! +und red +Ġintern ational +id ents +ĠF ree +rou p +Ġr ace +Ġm ach +Ġh uge +A ll +le ar +ove mber +Ġto wn +Ġatt ention +ĠO ff +y ond +ĠThe n +f ield +Ġter ror +ra z +ĠB o +Ġmeet ing +ĠP ark +Ġar rest +Ġf ear +Ġa w +ĠV al +or ing +' , +Ġext reme +ar r +Ġwork ers +A fter +Ġ3 1 +n et +am ent +Ġdirect ly +Ġpop ulation +ub e +ĠOct ober +ĠI N +ĠJan uary +5 9 +ĠDav id +Ġc ross +ce mber +ĠF irst +Ġmess age +ir it +Ġn ation +Ġp oll +is ions +Ġansw er +n y +is ode +Ġcar ry +ĠRuss ia +Ġhe ar +eng th +ro y +Ġn atural +in ally +Ġdo g +m itted +Ġtr ade +Ġsub st +Ġmult iple +ĠAf ric +Ġf ans +Ġs ort +Ġgl obal +ic ation +ĠW ed +ar a +Ġa chie +Ġlangu age +ve y +Ġt al +Ġnecess ary +Ġdet ails +Ġs en +ĠS und +ĠRe g +ĠR ec +0 6 +Ġs il +ress ive +Ġmed ical +un ch +orn ia +Ġu nd +f ort +oc ks +ĠM onday +ues day +c raft +7 7 +ur t +Ġ ver +ĠH ill +Ġrece ive +Ġmor ning +es tern +Ġb ank +Ġs at +ir th +ĠH igh +Ġdev ice +ĠTH E +ĠCent er +Ġsaf e +Ġp le +ĠCanad a +Ġsystem s +Ġass ist +Ġsur v +Ġb attle +ĠS oc +vert is +S he +Ġp aper +Ġgrow th +Ġc ast +S c +Ġpl ans +ll ed +Ġpart s +Ġw all +Ġmove ment +Ġpract ice +im ately +Ġdis play +Ġsomet imes +om p +ĠP aul +ĠY es +k ing +5 8 +o ly +Ġs on +Ġav oid +ok es +ĠJ ew +Ġto wards +as c +Ġ // +ĠK ore +Ġtalk ing +Ġcor rect +Ġsp ent +ic ks +i able +e ared +Ġter m +Ġwant s +om ing +Ġ ut +Ġdou b +Ġfor ces +Ġp lease +6 9 +ĠN ovember +at form +ond on +Ġon es +Ġimmedi ately +ĠRuss ian +ĠM et +Ġde g +Ġparent s +C H +ĠAmeric ans +al y +ĠM od +Ġsh own +Ġcond itions +Ġst uff +Ġre b +ĠY our +Ġinclud es +n own +ĠS am +Ġexper ien +m ission +ĠE ven +augh t +Ġannoun ced +ĠRepublic an +Ġdeter min +Ġdescrib ed +ĠCount y +( ) +Ġdo or +Ġchang ed +Ġne igh +ĠH ere +Ġcle an +Ġp an +ĠDe cember +ĠEurope an +ir ing +ap ter +Ġcl ub +ĠT uesday +Ġp aid +ĠN et +Ġattack s +Ġcharact ers +Ġal one +Ġdirect or +d om +Ġ3 5 +Ġl oad +Ġr out +ĠCalif ornia +Ġfin ally +Ġr ac +Ġcont r +Ġexact ly +res h +p ri +ĠIs lam +Ġn ature +Ġcare er +Ġlat est +Ġcon vers +ĠS l +p ose +ci ent +ĠIn c +iv ity +8 8 +ĠA tt +ĠM or +nes day +Ġwe ight +k en +Ġnot e +Ġteam s +Ġ \ +air s +ĠG reen +Ġh undred +on ent +Ġstre ng +Ġcons ist +ic ated +Ġreg ul +Ġl ic +ast ic +Ġt en +urs day +ellig ence +ous ly +ĠU K +B I +Ġcost s +Ġind epend +ĠA P +Ġnorm al +Ġh om +Ġob vious +Ġs we 
+Ġst ar +Ġread y +ac her +Ġimp lement +g est +Ġs ong +ĠG et +ĠL ab +Ġinterest ing +us ing +Ġg iving +ĠSund ay +Ġet c +Ġm iddle +Ġrem ember +r ight +os ition +ut ions +Ġm ax +4 6 +Ġyour self +Ġdem and +Ġtreat ment +Ġd anger +ĠC ons +Ġgu y +ĠBrit ish +Ġphys ical +Ġrel ated +Ġrem ain +Ġcould n +Ġref er +Ġc itiz +b ox +EN T +bo ard +Ġin n +I G +er o +ĠSt reet +osp ital +ren ch +cher s +Ġst ra +O L +ag er +ĠA N +Ġeas ily +I A +en ge +in y +Ġcl os +ock ed +Ġus es +ĠC oun +I m +u ild +? ? +m ore +Ġan g +Ġwr ite +ol ute +5 7 +Ġlead er +Ġread ing +< / +Ġaut om +est s +4 3 +Ġleg isl +ĠG old +Ġdesign ed +ĠS T +ĠLe g +a res +Ġbe aut +ĠT ex +Ġappear s +Ġstru gg +ĠR om +Ġ 00 +Ġcho ice +Ġparticular ly +ĠF rom +op er +ĠL ondon +ann ed +Ġallow s +ob ile +Ġdiffere nce +âĢ ¢ +ĠV iew +ĠWed nesday +Ġal though +Ġrel ative +Ġapplic ation +ate ver +Ġare n +Ġmy self +Ġim ag +Ġdis e +Ġsoc iety +Ġfre qu +ĠEng lish +Ġpo or +ĠD ay +Ġwrit ing +Ġse ven +Ġstart ing +Ġb ud +Ġpr int +ĠTr ans +uf act +ĠSt ud +n ew +Ġcr im +Ġg ives +Ġco ol +a e +i ance +ĠGener al +Ġthink ing +Ġsa ve +Ġlim ited +ĠPart y +Ġmean ing +p en +ow ers +ĠJ ack +E M +Ġn ice +ru pt +Ġg as +Ġe ight +Ġfe et +Ġeff ort +Ġ ign +ic it +B l +co in +Ġop in +Ġbr ain +Wh ile +he st +ĠTh ursday +Ġwould n +augh ter +Ġtou ch +le ments +Ġstud ies +Ġcent er +c ont +or ge +Ġcomput er +Ġinvestig ation +P l +or ks +Ġ200 8 +Ġincre asing +Ġst ore +Ġcom ments +Ġb al +m en +Ġdo ll +Ġl iber +Ġw ife +Ġlaw s +atur day +it ness +Ġmod ern +ĠS k +Ġadminist ration +Ġopportun ity +Ġs al +Ġpower ful +M y +Ġclaim s +ĠEar th +ord s +Ġt itle +Ġes c +n ame +N ot +om en +Ġbe yond +Ġc amer +Ġse ll +it ute +ear ch +Ġapp l +im ent +4 2 +ĠAr t +Ġun f +Ġviol ence +ur g +ĠE ast +Ġcomp ared +Ġopt ions +Ġthrough out +Ġv s +ig r +. [ +ac hes +7 8 +Ġfil es +F L +E L +ar ian +ĠJ ames +ĠA ir +an ch +Ġdet ail +Ġpie ce +P S +Ġn amed +Ġeduc ation +Ġdri ve +Ġitem s +Ġstud ent +ic ed +: : +ic o +Ġth row +Ġsc ene +Ġcomple x +Ġ200 9 +Ġpre c +ĠB re +7 9 +Ġcon cept +Ġstat us +am ing +Ġd ied +Ġknow ledge +Ġbegin ning +O D +ru ary +Ġcertain ly +Ġgu ys +Ġsl ight +in n +ound s +Ġf ine +Ġf at +ic ations +Ġper haps +ĠA nt +Ġinc ome +Ġhtt ps +Ġmajor ity +port s +st on +Ġgreat er +Ġfe ed +ent ially +Ġsaf ety +Ġun ique +and om +Ġg one +Ġshow ed +Ġhist or +Ġcoun ter +i us +id a +Ġlead ing +i pe +Ġs end +ĠDon ald +er ve +Ġdef ense +ines e +Ġy es +ĠF ire +ĠMus lim +ra q +Ġcontin ued +os h +Ġprov ides +Ġpr ison +ĠP re +Ġhapp y +Ġeconom y +Ġtr ust +ag s +ĠG ame +Ġweap ons +um an +ĠC le +it ation +Ġanal ysis +ĠT imes +Ġsc ience +- > +Ġfig ure +Ġdis app +ent y +Ġsoft ware +Ġu lt +Ġoffic ers +N ew +I s +Ġrem ains +ĠInd ia +Ġp sych +ri ef +Ġc at +es c +Ġob serv +Ġst age +ĠD ark +Ġent er +ch ange +Ġpass ed +Ġdes pite +ĠO ut +Ġmov ie +r s +Ġv oice +m ine +ĠPl ay +Ġto ward +ĠT er +Ġreg ion +Ġval ues +or ters +Ġm ount +Ġoffic er +ĠO ther +b an +Ġh ous +w ood +ro om +I V +ĠS un +se e +ĠO ver +ro g +9 0 +Ġl ay +ĠT ur +a wn +Ġpress ure +ĠS ub +Ġbook s +ed om +ĠS and +A A +ag o +Ġre asons +f ord +Ġactiv ity +U T +N ow +ĠSen ate +ce ll +n ight +Ġcall s +in ter +Ġlet ter +ĠR ob +ĠJ e +Ġcho ose +ĠL aw +G et +B e +Ġro b +Ġtyp es +Ġpl atform +Ġqu arter +R A +ĠT ime +Ġmay be +ĠC r +9 5 +p re +Ġmov ing +Ġl if +Ġgo ld +Ġs om +Ġpat ients +Ġtr uth +ĠK e +ur ance +ant ly +m ar +Ġchar ge +ĠG reat +Ġce le +---------------- ---------------- +Ġro ck +ro id +an cy +Ġcred it +a ud +B y +ĠE very +Ġmov ed +ing er +rib ution +Ġn ames +Ġstra ight +ĠHe alth +ĠW ell +Ġfe ature +Ġr ule +Ġsc he +in ated +ĠMich ael +ber g +4 1 +il ed +b and +Ġcl ick +ĠAng el 
+on ents +Â Ń +ĠI raq +ĠS aturday +Ġa ware +p art +Ġpat tern +O W +ĠL et +Ġgr ad +ign ed +Ġassoci ated +Ġst yle +n o +i ation +a ith +il ies +Ġst ories +ur ation +Ġindividual s +ĠâĢ ¦ +m iss +ĠAss oci +ish ing +ab y +Ġsum mer +ĠB en +Ġ3 2 +Ġar ch +ut y +ĠTex as +h ol +Ġfull y +Ġm ill +Ġfollow ed +ĠB ill +ĠInd ian +ĠSec ret +ĠB el +ĠFeb ruary +Ġjob s +Ġseem ed +ĠGo vern +i pped +Ġreal ity +Ġl ines +Ġp ark +Ġmeas ure +ĠO ur +I M +Ġbro ther +Ġgrow ing +Ġb an +Ġest im +Ġc ry +ĠS chool +Ġme chan +ĠO F +ĠWind ows +Ġr ates +ĠO h +Ġpos itive +Ġcult ure +ist ics +ic a +Ġh ar +y a +ite ly +i pp +Ġm ap +en cies +ĠWill iam +I I +ak ers +5 6 +ĠM art +ĠR em +Ġal tern +it ude +Ġco ach +row d +D on +Ġk ids +Ġj ournal +Ġcor por +Ġf alse +Ġwe b +Ġsle ep +Ġcont ain +Ġst o +Ġb ed +iver se +ĠR ich +ĠCh inese +Ġp un +Ġme ant +k nown +Ġnot ice +Ġfavor ite +a ven +Ġcond ition +Ġpur pose +) ) +Ġorgan ization +Ġchall eng +Ġman ufact +Ġsus p +ĠA c +Ġcrit ic +un es +uc lear +Ġm er +vent ion +Ġ8 0 +Ġm ist +ĠU s +ĠT or +htt p +ol f +Ġlarg er +Ġadv ant +Ġrese ar +Ġact ions +m l +Ġke pt +Ġa im +, ' +c ol +Ġbenef its +if ying +Ġact ual +ĠIntern ational +Ġveh icle +Ġch ief +Ġeff orts +ĠLe ague +ĠM ost +Ġwa it +Ġad ult +Ġover all +Ġspe ech +Ġhigh ly +Ġfem ale +Ġer ror +Ġeffect ive +5 4 +Ġenc our +w ell +Ġfail ed +Ġcons erv +Ġprogram s +Ġt rou +Ġa head +5 00 +vertis ement +I P +ĠF ound +p ir +Ġ % +Ġcr ime +and er +Ġloc ation +ĠI ran +Ġbehav ior +az ing +Ġr are +Ġem b +Ġca used +Ġsh ip +Ġact ive +Ġcont ribut +Ġg reen +Ġac qu +Ġref lect +ven ue +Ġf irm +Ġb irth +] . +Ġclear ly +Ġem ot +Ġag ency +ri age +Ġmem ory +9 8 +S A +ĠSe e +ac ing +C C +Ġbig gest +Ġr ap +Ġbas ic +Ġb and +e at +Ġsus pect +ĠM ac +Ġ9 0 +m ark +ist an +Ġsp read +am s +k i +as y +ra v +ĠR ober +Ġdemon str +r ated +Ġabs olute +Ġpl aces +Ġim pl +ibr ary +Ġc ards +Ġdest roy +Ġv irt +ve re +Ġapp eared +y an +p oint +Ġbe g +Ġtem per +s pe +ant ed +ear s +ĠD irect +Ġl ength +Ġbl og +am b +Ġint eg +Ġres ources +ac c +if ul +Ġsp ot +Ġfor ced +Ġthous ands +ĠMin ister +Ġqu al +ĠF rench +at ically +Ġgener ally +Ġdr ink +Ġth us +I L +od es +Ġappro pri +ĠRe ad +Ġwh om +Ġey e +Ġcol lege +Ġ4 5 +ire ction +Ġens ure +Ġapp arent +id ers +Ġrelig ious +Ġmin or +ol ic +Ġt ro +ĠWh y +rib ute +m et +Ġprim ary +Ġdevelop ed +Ġpe ace +Ġsk in +st e +av a +Ġbl ue +Ġfam ilies +Ġ ir +Ġapp ly +Ġin form +ĠSm ith +C T +i i +Ġlim it +Ġres ist +........ ........ 
+um n +Ġconf lic +Ġtw e +ud d +ĠT om +Ġl iter +qu e +b on +Ġha ir +Ġevent ually +Ġp us +Ġhelp ed +Ġag g +or ney +ĠApp le +Ġf it +ĠS ur +Ġpre m +Ġs ales +Ġsecond s +Ġstreng th +Ġfeel ing +¿ ½ +Ġt our +Ġknow s +o om +Ġex erc +Ġsom ew +ï ¿½ +> > +Ġsp okes +Ġide as +Ġreg ist +so ft +ĠD el +ĠP C +Ġpro pos +Ġlaun ch +Ġbott om +T H +ĠP lease +v est +it z +ĠIn ter +Ġsc ript +Ġr at +ar ning +Ġ il +ĠJ er +ĠA re +Ġwh atever +ok en +ci ence +Ġmod e +Ġag ree +Ġs ources +Ġinit ial +Ġrest rict +Ġwond er +us ion +## ## +ĠS il +vil le +Ġb urn +t w +as ion +Ġ £ +Ġn or +u ing +Ġre ached +Ġs un +Ġc ateg +ig ration +Ġc ook +Ġprom ot +Ġm ale +Ġcl imate +Ġf ix +Ġalleg ed +U R +all ed +Ġim ages +C ont +ot a +Ġschool s +i os +Ġd rop +Ġst ream +ĠM o +Ġprevious ly +al ing +Ġp et +Ġdou ble +Ġ( @ +ann el +Ġdef ault +t ies +Ġr ank +ĠD ec +ĠCoun cil +Ġweap on +Ġst ock +Ġanal y +ĠSt r +Ġpict ure +ĠPol ice +f erence +Ġcent ury +Ġcitiz ens +Ġon to +Ġexp and +Ġhe ro +ĠS ol +Ġw ild +Ġupd ate +Ġcustom ers +r ont +d ef +Ġl ik +Ġcrim inal +ĠChrist ian +S P +7 6 +Ġle aving +Ġother wise +ĠD ist +Ġbas is +5 2 +5 3 +ic ip +ĠB er +Ġrecomm end +Ġfl oor +Ġc rowd +ol es +Ġ7 0 +Ġcent ral +ĠE v +Ġd ream +Ġdown load +Ġconf ir +ĠTh om +Ġwind ow +Ġhapp ens +Ġun it +Ġt end +Ġs pl +Ġbec omes +Ġfight ing +Ġpred ict +ĠP ress +ĠP ower +Ġhe avy +ak ed +Ġf an +or ter +ate gy +B A +iz es +Ġsp end +H ere +Ġ200 7 +Ġad op +ĠH am +Ġfoot ball +ĠP ort +od ay +5 1 +amp ions +Ġtrans fer +h t +Ġ3 8 +ter m +ac ity +Ġb ur +] , +tern al +r ig +b ut +Ġthere fore +ĠB ecause +res p +re y +Ġm ission +S ome +Ġnot ed +Ġass um +Ġdise ase +Ġed it +Ġprog ress +r d +ĠB rown +oc al +Ġadd ing +Ġra ised +ĠAn y +Ġt ick +Ġsee ing +ĠPe ople +Ġagre ement +Ġser ver +Ġw at +Ġdeb ate +Ġsupp osed +il ing +Ġlarg est +Ġsuccess ful +ĠP ri +ĠDemocr atic +Ġj ump +ĠSyri a +Ġown ers +Ġoff ers +Ġshoot ing +Ġeff ic +se y +Ġha ven +ver se +te red +ĠL ight +im al +ĠB ig +Ġdef end +Ġbe at +Ġrecord s +% ) +Ġsc en +Ġemploy ees +Ġdev ices +he m +Ġcom mer +ĠM ex +Ġbenef it +ĠPro f +Ġil leg +Ġsur face +ĠAl so +Ġh arm +ing ly +w ide +ĠA lex +Ġsh ut +ĠC ur +Ġl ose +p m +Ġchall enge +se mb +Ġst ation +Ġint elligence +Ġacc ur +ĠFl or +Ġrequ ires +ĠM al +b um +Ġh ospital +Ġsp irit +Ġoff ered +Ġprodu ce +ĠComm un +Ġcreat ing +Ġcr is +s pect +Ġend ed +Ġd aily +Ġvot ers +land s +i as +i h +on a +Ġsm art +ĠOff ice +ĠL ord +ri al +ĠIntern et +Ġcirc um +Ġextreme ly +' . 
+Ġopin ion +ĠM il +Ġg ain +B S +ĠF in +y p +Ġuse ful +Ġbud get +Ġcom fort +is f +Ġback ground +el ine +Ġep isode +Ġen emy +Ġtri al +Ġestab lish +d ate +ĠC ap +Ġcontin ues +Ġshow ing +ĠUn ion +w ith +Ġpost ed +ĠSy stem +Ġe at +ri an +Ġr ise +ĠGerman y +il s +Ġsign ed +Ġv ill +Ġgr and +m or +ĠEng land +Ġproject s +um ber +Ġconf erence +z a +Ġrespons ible +ĠAr ab +Ġlearn ed +âĢĶ âĢĶ +i pping +ĠGe orge +O C +Ġreturn ed +ĠAustral ia +Ġb rief +Q u +Ġbr and +ill ing +ab led +Ġhig hest +Ġtr ain +ĠComm ission +wh ile +Ġn om +cept ion +Ġm ut +ĠBl ue +Ġinc ident +v ant +8 6 +ĠI D +Ġn uclear +7 4 +ĠL ike +ĠR E +ĠM icro +l i +m ail +Ġcharg es +8 9 +Ġad just +ad o +Ġear th +N A +Ġpr ices +P A +Ġd raft +Ġrun s +Ġcandid ate +ens es +Ġmanag ement +ĠPh il +ĠM iss +Ġte ach +g ram +Ġunderstand ing +a it +ic ago +A dd +ĠE p +sec ut +Ġsepar ate +Ġinst ance +Ġe th +Ġun less +**** **** +ĠF ore +in ate +Ġoper ations +S p +Ġf aith +g ar +ĠCh urch +ron ic +Ġconf ig +os ure +Ġactiv ities +Ġtrad itional +Ġ3 6 +Ġd irection +Ġmach ine +Ġsur round +Ġp ush +un ction +ĠE U +Ġeas ier +Ġarg ument +G B +Ġm icro +Ġsp ending +iz ations +Ġthe ory +ad ow +Ġcall ing +ĠL ast +Ġd er +Ġinflu ence +Ġcomm it +Ġph oto +Ġun c +ist ry +g n +ast e +ack s +Ġdis p +ad y +d o +ĠG ood +Ġ ` +Ġw ish +Ġreve aled +Âł Âł +l ig +Ġen force +ĠComm ittee +Ġche m +Ġmil es +Ġinterest ed +Ġsol ution +ic y +in ct +Ġ- > +ĠD et +Ġrem oved +Ġcomp ar +e ah +Ġpl ant +ĠS ince +Ġachie ve +Ġadvant age +Ġslight ly +b ing +Ġpl aced +u nder +201 5 +ĠM ad +Ġt im +os es +Ġc ru +ĠR ock +Ġmost ly +Ġneg ative +Ġset ting +Ġprodu ced +Ġm ur +Ġconnect ion +ĠM er +Ġdri ver +Ġexecut ive +Ġass ault +Ġb orn +ĠV er +t ained +Ġstruct ure +Ġredu ce +Ġdec ades +Ġd ed +u ke +ĠM any +idd en +Ġle ague +S e +Ġjo in +Ġdis co +Ġd ie +c ks +act ions +Ġass ess +ag n +Ġgo als +our s +I R +Ġsen ior +ill er +m od +ip ment +oc ol +u y +ĠQ ue +Ġpart ies +ir gin +Ġle arning +it able +Ġstre et +Ġcamer a +A pp +Ġsk ills +b re +c ious +Ġcele br +ĠFr anc +Ġexist ing +Ġwill ing +l or +Ġ id +ĠSp ace +Ġcrit ical +ĠL a +ortun ately +Ġser ve +Ġc old +Ġspec ies +T S +Ġanim als +ĠB ay +Ġold er +ĠU nder +est ic +ĠT re +Ġte acher +Ġpre fer +v is +Ġth read +ĠM att +Ġmanag er +ãĥ » +Ġprofess ional +ĠV ol +Ġnot es +The se +ul a +Ġf resh +ent ed +u zz +ed y +clus ion +ĠR el +Ġdoub t +E O +Ġopen ed +ĠB it +Ad vertisement +Ġgu ess +ĠU N +Ġse qu +Ġexpl ain +ott en +Ġatt ract +ak s +Ġstr ing +Ġcont ext +oss ible +ĠRepublic ans +Ġsol id +Ġc ities +Ġask ing +Ġr andom +u ps +ur ies +ar ant +dd en +g l +ĠFlor ida +Ġdep end +ĠSc ott +Ġ3 3 +Ġi T +ic on +Ġmention ed +Ġ2 000 +Ġclaim ed +Ġdefin itely +ul f +Ġc ore +Ġopen ing +ĠCon st +wh ich +ĠT ra +A G +7 2 +Ġbelie ved +ad a +Ġ4 8 +ĠSec urity +yr ight +ĠP et +ĠL ou +Ġhold ing +======== ======== +Ġ ice +Ġb row +Ġauthor ities +h ost +w ord +Ġsc ore +ĠD iv +Ġcell s +Ġtrans l +Ġneigh bor +Ġrem ove +u ct +Ġdist rict +ĠA ccording +Ġwor se +Ġconcern s +Ġpresident ial +Ġpolic ies +ĠH all +7 3 +Ġh us +A Y +Ġ200 6 +ĠJ ud +Ġindepend ent +ĠJust ice +ili ar +pr int +igh ter +Ġprotect ion +z en +Ġsu dden +h ouse +ĠJ es +P R +ĠIn f +Ġb ul +Ġ _ +ĠServ ice +ĠP R +Ġstr ategy +ff ect +Ġgirl s +Ġmiss ing +oy al +ĠTe am +ul ated +Ġd at +Ġpolit ics +ab or +A ccording +Ġspe ll +Ġg raph +ort hern +T C +A b +Ġlab or +is her +Ġk ick +ĠiT unes +Ġstep s +pos es +Ġsmall er +E n +ber t +Ġro ll +Ġresear chers +Ġcl osed +Ġtrans port +Ġlaw y +________ ________ +ĠCh icago +Ġas pect +Ġn one +Ġmar riage +9 6 +Ġe lements +ĠF re +ĠS al +Ġd ram +F C +t op +e qu +Ġhe aring +Ġsupport ed +Ġtest ing +co hol +Ġmass 
ive +Ġst ick +Ġgu ard +is co +ph one +F rom +How ever +Ġb order +Ġcop y +ograph y +l ist +7 1 +Ġown er +cl ass +ru it +r ate +ĠO nce +Ġdig ital +Ġt ask +ER S +Ġinc red +t es ++ + +ĠFr ance +Ġb reat +ow l +Ġiss ued +ĠW estern +Ġdet ect +Ġpart ners +Ġsh ared +ĠC all +Ġcan cer +ac he +rib e +Ġexpl ained +Ġhe at +{ " +Ġinvest ment +ĠB ook +Ġw ood +Ġtool s +ĠAl though +Ġbelie f +Ġcris is +Ġg e +ĠM P +Ġoper ation +ty pe +~ ~ +g a +Ġcont ains +ant a +Ġexp ress +ĠG roup +ĠJ ournal +k a +Ġam b +ĠUS A +Ġfind ing +Ġfund ing +h ow +Ġestab lished +ide os +Ġdeg ree +Ġdanger ous +ang ing +Ġfre edom +pp ort +out hern +Ġch urch +Ġc atch +ĠTw o +Ġpres ence +ĠGu ard +U p +Ġauthor ity +ĠPro ject +Ġbut ton +Ġcon sequ +Ġval id +Ġwe ak +Ġstart s +Ġref erence +ĠM em +" ) +U N +or age +ĠO pen +Ġcol lection +y m +g ency +Ġbeaut iful +ro s +Ġtell s +Ġwa iting +n el +Ġprov iding +ĠDemocr ats +Ġd aughter +Ġm aster +Ġpur poses +ĠJapan ese +Ġequ al +Ġturn s +Ġdoc uments +Ġwatch ing +R es +Ġr an +201 4 +Ġre ject +ĠKore a +Ġvictim s +Le vel +ere nces +Ġw itness +Ġ3 4 +Ġre form +com ing +Ġocc up +Ġc aught +Ġtra ffic +ad ing +Ġmod els +ar io +Ġserv ed +Ġb atter +u ate +ĠSecret ary +Ġagre ed +Ġtr uly +yn am +ĠR et +Ġun its +ĠRes earch +h and +az ine +ĠM ike +Ġvar iety +ot al +Ġam azing diff --git a/tests/data/t0/gpt2-tiny-vocab.json b/tests/data/t0/gpt2-tiny-vocab.json new file mode 100644 index 000000000..f83c21b01 --- /dev/null +++ b/tests/data/t0/gpt2-tiny-vocab.json @@ -0,0 +1 @@ +{"!":0,"\"":1,"#":2,"$":3,"%":4,"&":5,"'":6,"(":7,")":8,"*":9,"+":10,",":11,"-":12,".":13,"/":14,"0":15,"1":16,"2":17,"3":18,"4":19,"5":20,"6":21,"7":22,"8":23,"9":24,":":25,";":26,"<":27,"=":28,">":29,"?":30,"@":31,"A":32,"B":33,"C":34,"D":35,"E":36,"F":37,"G":38,"H":39,"I":40,"J":41,"K":42,"L":43,"M":44,"N":45,"O":46,"P":47,"Q":48,"R":49,"S":50,"T":51,"U":52,"V":53,"W":54,"X":55,"Y":56,"Z":57,"[":58,"\\":59,"]":60,"^":61,"_":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"{":90,"|":91,"}":92,"~":93,"¡":94,"¢":95,"£":96,"¤":97,"¥":98,"¦":99,"§":100,"¨":101,"©":102,"ª":103,"«":104,"¬":105,"®":106,"¯":107,"°":108,"±":109,"²":110,"³":111,"´":112,"µ":113,"¶":114,"·":115,"¸":116,"¹":117,"º":118,"»":119,"¼":120,"½":121,"¾":122,"¿":123,"À":124,"Á":125,"Â":126,"Ã":127,"Ä":128,"Å":129,"Æ":130,"Ç":131,"È":132,"É":133,"Ê":134,"Ë":135,"Ì":136,"Í":137,"Î":138,"Ï":139,"Ð":140,"Ñ":141,"Ò":142,"Ó":143,"Ô":144,"Õ":145,"Ö":146,"×":147,"Ø":148,"Ù":149,"Ú":150,"Û":151,"Ü":152,"Ý":153,"Þ":154,"ß":155,"à":156,"á":157,"â":158,"ã":159,"ä":160,"å":161,"æ":162,"ç":163,"è":164,"é":165,"ê":166,"ë":167,"ì":168,"í":169,"î":170,"ï":171,"ð":172,"ñ":173,"ò":174,"ó":175,"ô":176,"õ":177,"ö":178,"÷":179,"ø":180,"ù":181,"ú":182,"û":183,"ü":184,"ý":185,"þ":186,"ÿ":187,"Ā":188,"ā":189,"Ă":190,"ă":191,"Ą":192,"ą":193,"Ć":194,"ć":195,"Ĉ":196,"ĉ":197,"Ċ":198,"ċ":199,"Č":200,"č":201,"Ď":202,"ď":203,"Đ":204,"đ":205,"Ē":206,"ē":207,"Ĕ":208,"ĕ":209,"Ė":210,"ė":211,"Ę":212,"ę":213,"Ě":214,"ě":215,"Ĝ":216,"ĝ":217,"Ğ":218,"ğ":219,"Ġ":220,"ġ":221,"Ģ":222,"ģ":223,"Ĥ":224,"ĥ":225,"Ħ":226,"ħ":227,"Ĩ":228,"ĩ":229,"Ī":230,"ī":231,"Ĭ":232,"ĭ":233,"Į":234,"į":235,"İ":236,"ı":237,"IJ":238,"ij":239,"Ĵ":240,"ĵ":241,"Ķ":242,"ķ":243,"ĸ":244,"Ĺ":245,"ĺ":246,"Ļ":247,"ļ":248,"Ľ":249,"ľ":250,"Ŀ":251,"ŀ":252,"Ł":253,"ł":254,"Ń":255,"Ġt":256,"Ġa":257,"he":258,"in":259,"re":260,"on":261,"Ġthe":262,"er":263,"Ġs":264,"at":265,"Ġw":266,"Ġo":267,"en":268,"Ġc":269,"it":270,"
is":271,"an":272,"or":273,"es":274,"Ġb":275,"ed":276,"Ġf":277,"ing":278,"Ġp":279,"ou":280,"Ġan":281,"al":282,"ar":283,"Ġto":284,"Ġm":285,"Ġof":286,"Ġin":287,"Ġd":288,"Ġh":289,"Ġand":290,"ic":291,"as":292,"le":293,"Ġth":294,"ion":295,"om":296,"ll":297,"ent":298,"Ġn":299,"Ġl":300,"st":301,"Ġre":302,"ve":303,"Ġe":304,"ro":305,"ly":306,"Ġbe":307,"Ġg":308,"ĠT":309,"ct":310,"ĠS":311,"id":312,"ot":313,"ĠI":314,"ut":315,"et":316,"ĠA":317,"Ġis":318,"Ġon":319,"im":320,"am":321,"ow":322,"ay":323,"ad":324,"se":325,"Ġthat":326,"ĠC":327,"ig":328,"Ġfor":329,"ac":330,"Ġy":331,"ver":332,"ur":333,"Ġu":334,"ld":335,"Ġst":336,"ĠM":337,"'s":338,"Ġhe":339,"Ġit":340,"ation":341,"ith":342,"ir":343,"ce":344,"Ġyou":345,"il":346,"ĠB":347,"Ġwh":348,"ol":349,"ĠP":350,"Ġwith":351,"Ġ1":352,"ter":353,"ch":354,"Ġas":355,"Ġwe":356,"Ġ(":357,"nd":358,"ill":359,"ĠD":360,"if":361,"Ġ2":362,"ag":363,"ers":364,"ke":365,"Ġ\"":366,"ĠH":367,"em":368,"Ġcon":369,"ĠW":370,"ĠR":371,"her":372,"Ġwas":373,"Ġr":374,"od":375,"ĠF":376,"ul":377,"ate":378,"Ġat":379,"ri":380,"pp":381,"ore":382,"ĠThe":383,"Ġse":384,"us":385,"Ġpro":386,"Ġha":387,"um":388,"Ġare":389,"Ġde":390,"ain":391,"and":392,"Ġor":393,"igh":394,"est":395,"ist":396,"ab":397,"rom":398,"ĠN":399,"th":400,"Ġcom":401,"ĠG":402,"un":403,"op":404,"00":405,"ĠL":406,"Ġnot":407,"ess":408,"Ġex":409,"Ġv":410,"res":411,"ĠE":412,"ew":413,"ity":414,"ant":415,"Ġby":416,"el":417,"os":418,"ort":419,"oc":420,"qu":421,"Ġfrom":422,"Ġhave":423,"Ġsu":424,"ive":425,"ould":426,"Ġsh":427,"Ġthis":428,"nt":429,"ra":430,"pe":431,"ight":432,"art":433,"ment":434,"Ġal":435,"ust":436,"end":437,"--":438,"all":439,"ĠO":440,"ack":441,"Ġch":442,"Ġle":443,"ies":444,"red":445,"ard":446,"âĢ":447,"out":448,"ĠJ":449,"Ġab":450,"ear":451,"iv":452,"ally":453,"our":454,"ost":455,"gh":456,"pt":457,"Ġpl":458,"ast":459,"Ġcan":460,"ak":461,"ome":462,"ud":463,"The":464,"Ġhis":465,"Ġdo":466,"Ġgo":467,"Ġhas":468,"ge":469,"'t":470,"ĠU":471,"rou":472,"Ġsa":473,"Ġj":474,"Ġbut":475,"Ġwor":476,"Ġall":477,"ect":478,"Ġk":479,"ame":480,"Ġwill":481,"ok":482,"Ġwhe":483,"Ġthey":484,"ide":485,"01":486,"ff":487,"ich":488,"pl":489,"ther":490,"Ġtr":491,"..":492,"Ġint":493,"ie":494,"ure":495,"age":496,"Ġne":497,"ial":498,"ap":499,"ine":500,"ice":501,"Ġme":502,"Ġout":503,"ans":504,"one":505,"ong":506,"ions":507,"Ġwho":508,"ĠK":509,"Ġup":510,"Ġtheir":511,"Ġad":512,"Ġ3":513,"Ġus":514,"ated":515,"ous":516,"Ġmore":517,"ue":518,"og":519,"ĠSt":520,"ind":521,"ike":522,"Ġso":523,"ime":524,"per":525,".\"":526,"ber":527,"iz":528,"act":529,"Ġone":530,"Ġsaid":531,"Ġ-":532,"are":533,"Ġyour":534,"cc":535,"ĠTh":536,"Ġcl":537,"ep":538,"ake":539,"able":540,"ip":541,"Ġcont":542,"Ġwhich":543,"ia":544,"Ġim":545,"Ġabout":546,"Ġwere":547,"very":548,"ub":549,"Ġhad":550,"Ġen":551,"Ġcomp":552,",\"":553,"ĠIn":554,"Ġun":555,"Ġag":556,"ire":557,"ace":558,"au":559,"ary":560,"Ġwould":561,"ass":562,"ry":563,"ĠâĢ":564,"cl":565,"ook":566,"ere":567,"so":568,"ĠV":569,"ign":570,"ib":571,"Ġoff":572,"Ġte":573,"ven":574,"ĠY":575,"ile":576,"ose":577,"ite":578,"orm":579,"Ġ201":580,"Ġres":581,"Ġman":582,"Ġper":583,"Ġother":584,"ord":585,"ult":586,"Ġbeen":587,"Ġlike":588,"ase":589,"ance":590,"ks":591,"ays":592,"own":593,"ence":594,"Ġdis":595,"ction":596,"Ġany":597,"Ġapp":598,"Ġsp":599,"int":600,"ress":601,"ations":602,"ail":603,"Ġ4":604,"ical":605,"Ġthem":606,"Ġher":607,"ount":608,"ĠCh":609,"Ġar":610,"Ġif":611,"Ġthere":612,"Ġpe":613,"Ġyear":614,"av":615,"Ġmy":616,"Ġsome":617,"Ġwhen":618,"ough":619,"ach":620,"Ġthan":621,"ru":622,"ond":623,"ick":624,"Ġover":625,"vel":626,"Ġqu":627,"ĊĊ":628,"Ġs
c":629,"reat":630,"ree":631,"ĠIt":632,"ound":633,"port":634,"Ġalso":635,"Ġpart":636,"fter":637,"Ġkn":638,"Ġbec":639,"Ġtime":640,"ens":641,"Ġ5":642,"ople":643,"Ġwhat":644,"Ġno":645,"du":646,"mer":647,"ang":648,"Ġnew":649,"----":650,"Ġget":651,"ory":652,"ition":653,"ings":654,"Ġjust":655,"Ġinto":656,"Ġ0":657,"ents":658,"ove":659,"te":660,"Ġpeople":661,"Ġpre":662,"Ġits":663,"Ġrec":664,"Ġtw":665,"ian":666,"irst":667,"ark":668,"ors":669,"Ġwork":670,"ade":671,"ob":672,"Ġshe":673,"Ġour":674,"wn":675,"ink":676,"lic":677,"Ġ19":678,"ĠHe":679,"ish":680,"nder":681,"ause":682,"Ġhim":683,"ons":684,"Ġ[":685,"Ġro":686,"form":687,"ild":688,"ates":689,"vers":690,"Ġonly":691,"oll":692,"Ġspe":693,"ck":694,"ell":695,"amp":696,"Ġacc":697,"Ġbl":698,"ious":699,"urn":700,"ft":701,"ood":702,"Ġhow":703,"hed":704,"Ġ'":705,"Ġafter":706,"aw":707,"Ġatt":708,"ov":709,"ne":710,"Ġplay":711,"erv":712,"ict":713,"Ġcould":714,"itt":715,"Ġam":716,"Ġfirst":717,"Ġ6":718,"Ġact":719,"Ġ$":720,"ec":721,"hing":722,"ual":723,"ull":724,"Ġcomm":725,"oy":726,"old":727,"ces":728,"ater":729,"Ġfe":730,"Ġbet":731,"we":732,"iff":733,"Ġtwo":734,"ock":735,"Ġback":736,").":737,"ident":738,"Ġunder":739,"rough":740,"sel":741,"xt":742,"Ġmay":743,"round":744,"Ġpo":745,"ph":746,"iss":747,"Ġdes":748,"Ġmost":749,"Ġdid":750,"Ġadd":751,"ject":752,"Ġinc":753,"fore":754,"Ġpol":755,"ont":756,"Ġagain":757,"clud":758,"tern":759,"Ġknow":760,"Ġneed":761,"Ġcons":762,"Ġco":763,"Ġ.":764,"Ġwant":765,"Ġsee":766,"Ġ7":767,"ning":768,"iew":769,"ĠThis":770,"ced":771,"Ġeven":772,"Ġind":773,"ty":774,"ĠWe":775,"ath":776,"Ġthese":777,"Ġpr":778,"Ġuse":779,"Ġbecause":780,"Ġfl":781,"ng":782,"Ġnow":783,"ĠâĢĵ":784,"com":785,"ise":786,"Ġmake":787,"Ġthen":788,"ower":789,"Ġevery":790,"ĠUn":791,"Ġsec":792,"oss":793,"uch":794,"Ġem":795,"Ġ=":796,"ĠRe":797,"ied":798,"rit":799,"Ġinv":800,"lect":801,"Ġsupp":802,"ating":803,"Ġlook":804,"man":805,"pect":806,"Ġ8":807,"row":808,"Ġbu":809,"Ġwhere":810,"ific":811,"Ġyears":812,"ily":813,"Ġdiff":814,"Ġshould":815,"Ġrem":816,"Th":817,"In":818,"Ġev":819,"day":820,"'re":821,"rib":822,"Ġrel":823,"ss":824,"Ġdef":825,"Ġright":826,"Ġsy":827,"),":828,"les":829,"000":830,"hen":831,"Ġthrough":832,"ĠTr":833,"__":834,"Ġway":835,"Ġdon":836,"Ġ,":837,"Ġ10":838,"ased":839,"Ġass":840,"ublic":841,"Ġreg":842,"ĠAnd":843,"ix":844,"Ġvery":845,"Ġinclud":846,"other":847,"Ġimp":848,"oth":849,"Ġsub":850,"ĠâĢĶ":851,"Ġbeing":852,"arg":853,"ĠWh":854,"==":855,"ible":856,"Ġdoes":857,"ange":858,"ram":859,"Ġ9":860,"ert":861,"ps":862,"ited":863,"ational":864,"Ġbr":865,"Ġdown":866,"Ġmany":867,"aking":868,"Ġcall":869,"uring":870,"ities":871,"Ġph":872,"ics":873,"als":874,"Ġdec":875,"ative":876,"ener":877,"Ġbefore":878,"ility":879,"Ġwell":880,"Ġmuch":881,"erson":882,"Ġthose":883,"Ġsuch":884,"Ġke":885,"Ġend":886,"ĠBut":887,"ason":888,"ting":889,"Ġlong":890,"ef":891,"Ġthink":892,"ys":893,"Ġbel":894,"Ġsm":895,"its":896,"ax":897,"Ġown":898,"Ġprov":899,"Ġset":900,"ife":901,"ments":902,"ble":903,"ward":904,"Ġshow":905,"Ġpres":906,"ms":907,"omet":908,"Ġob":909,"Ġsay":910,"ĠSh":911,"ts":912,"ful":913,"Ġeff":914,"Ġgu":915,"Ġinst":916,"und":917,"ren":918,"cess":919,"Ġent":920,"ĠYou":921,"Ġgood":922,"Ġstart":923,"ince":924,"Ġmade":925,"tt":926,"stem":927,"olog":928,"up":929,"Ġ|":930,"ump":931,"Ġhel":932,"vern":933,"ular":934,"ually":935,"Ġac":936,"Ġmon":937,"Ġlast":938,"Ġ200":939,"10":940,"Ġstud":941,"ures":942,"ĠAr":943,"self":944,"ars":945,"meric":946,"ues":947,"cy":948,"Ġmin":949,"ollow":950,"Ġcol":951,"io":952,"Ġmod":953,"Ġcount":954,"ĠCom":955,"hes":956,"Ġfin":957,"air":958,"ier":959,"âĢĶ":
960,"read":961,"ank":962,"atch":963,"ever":964,"Ġstr":965,"Ġpoint":966,"ork":967,"ĠNew":968,"Ġsur":969,"ool":970,"alk":971,"ement":972,"Ġused":973,"ract":974,"ween":975,"Ġsame":976,"oun":977,"ĠAl":978,"ci":979,"Ġdiffere":980,"Ġwhile":981,"--------":982,"Ġgame":983,"cept":984,"Ġsim":985,"...":986,"Ġinter":987,"ek":988,"Ġreport":989,"Ġprodu":990,"Ġstill":991,"led":992,"ah":993,"Ġhere":994,"Ġworld":995,"Ġthough":996,"Ġnum":997,"arch":998,"imes":999,"ale":1000,"ĠSe":1001,"ĠIf":1002,"//":1003,"ĠLe":1004,"Ġret":1005,"Ġref":1006,"Ġtrans":1007,"ner":1008,"ution":1009,"ters":1010,"Ġtake":1011,"ĠCl":1012,"Ġconf":1013,"way":1014,"ave":1015,"Ġgoing":1016,"Ġsl":1017,"ug":1018,"ĠAmeric":1019,"Ġspec":1020,"Ġhand":1021,"Ġbetween":1022,"ists":1023,"ĠDe":1024,"oot":1025,"It":1026,"Ġear":1027,"Ġagainst":1028,"Ġhigh":1029,"gan":1030,"az":1031,"ather":1032,"Ġexp":1033,"Ġop":1034,"Ġins":1035,"Ġgr":1036,"Ġhelp":1037,"Ġrequ":1038,"ets":1039,"ins":1040,"ĠPro":1041,"ism":1042,"Ġfound":1043,"land":1044,"ata":1045,"uss":1046,"ames":1047,"Ġperson":1048,"Ġgreat":1049,"pr":1050,"Ġsign":1051,"ĠAn":1052,"'ve":1053,"Ġsomet":1054,"Ġser":1055,"hip":1056,"Ġrun":1057,"Ġ:":1058,"Ġter":1059,"irect":1060,"Ġfollow":1061,"Ġdet":1062,"ices":1063,"Ġfind":1064,"12":1065,"Ġmem":1066,"Ġcr":1067,"ered":1068,"ex":1069,"Ġext":1070,"uth":1071,"ense":1072,"co":1073,"Ġteam":1074,"ving":1075,"ouse":1076,"ash":1077,"att":1078,"ved":1079,"Ġsystem":1080,"ĠAs":1081,"der":1082,"ives":1083,"min":1084,"Ġlead":1085,"ĠBl":1086,"cent":1087,"Ġaround":1088,"Ġgovern":1089,"Ġcur":1090,"velop":1091,"any":1092,"Ġcour":1093,"alth":1094,"ages":1095,"ize":1096,"Ġcar":1097,"ode":1098,"Ġlaw":1099,"Ġread":1100,"'m":1101,"con":1102,"Ġreal":1103,"Ġsupport":1104,"Ġ12":1105,"....":1106,"Ġreally":1107,"ness":1108,"Ġfact":1109,"Ġday":1110,"Ġboth":1111,"ying":1112,"Ġserv":1113,"ĠFor":1114,"Ġthree":1115,"Ġwom":1116,"Ġmed":1117,"ody":1118,"ĠThey":1119,"50":1120,"Ġexper":1121,"ton":1122,"Ġeach":1123,"akes":1124,"Ġche":1125,"Ġcre":1126,"ines":1127,"Ġrep":1128,"19":1129,"gg":1130,"illion":1131,"Ġgrou":1132,"ute":1133,"ik":1134,"We":1135,"get":1136,"ER":1137,"Ġmet":1138,"Ġsays":1139,"ox":1140,"Ġduring":1141,"ern":1142,"ized":1143,"ared":1144,"Ġfam":1145,"ically":1146,"Ġhapp":1147,"ĠIs":1148,"Ġchar":1149,"med":1150,"vent":1151,"Ġgener":1152,"ient":1153,"ple":1154,"iet":1155,"rent":1156,"11":1157,"ves":1158,"ption":1159,"Ġ20":1160,"formation":1161,"Ġcor":1162,"Ġoffic":1163,"ield":1164,"Ġtoo":1165,"ision":1166,"Ġinf":1167,"ĠZ":1168,"the":1169,"oad":1170,"Ġpublic":1171,"Ġprog":1172,"ric":1173,"**":1174,"Ġwar":1175,"Ġpower":1176,"view":1177,"Ġfew":1178,"Ġloc":1179,"Ġdifferent":1180,"Ġstate":1181,"Ġhead":1182,"'ll":1183,"Ġposs":1184,"Ġstat":1185,"ret":1186,"ants":1187,"Ġval":1188,"Ġiss":1189,"Ġcle":1190,"ivers":1191,"anc":1192,"Ġexpl":1193,"Ġanother":1194,"ĠQ":1195,"Ġav":1196,"thing":1197,"nce":1198,"Wh":1199,"Ġchild":1200,"Ġsince":1201,"ired":1202,"less":1203,"Ġlife":1204,"Ġdevelop":1205,"ittle":1206,"Ġdep":1207,"Ġpass":1208,"ãĥ":1209,"Ġturn":1210,"orn":1211,"This":1212,"bers":1213,"ross":1214,"ĠAd":1215,"Ġfr":1216,"Ġresp":1217,"Ġsecond":1218,"oh":1219,"Ġ/":1220,"Ġdisc":1221,"Ġ&":1222,"Ġsomething":1223,"Ġcomple":1224,"Ġed":1225,"Ġfil":1226,"Ġmonth":1227,"aj":1228,"uc":1229,"Ġgovernment":1230,"Ġwithout":1231,"Ġleg":1232,"Ġdist":1233,"Ġput":1234,"Ġquest":1235,"ann":1236,"Ġprot":1237,"20":1238,"Ġnever":1239,"ience":1240,"Ġlevel":1241,"Ġart":1242,"Ġthings":1243,"Ġmight":1244,"Ġeffect":1245,"Ġcontro":1246,"Ġcent":1247,"Ġ18":1248,"Ġallow":1249,"Ġbelie":1250,"chool":1251,"ott":1252,"Ġincre"
:1253,"Ġfeel":1254,"Ġresult":1255,"Ġlot":1256,"Ġfun":1257,"ote":1258,"Ġty":1259,"erest":1260,"Ġcontin":1261,"Ġusing":1262,"Ġbig":1263,"201":1264,"Ġask":1265,"Ġbest":1266,"Ġ)":1267,"IN":1268,"Ġopp":1269,"30":1270,"Ġnumber":1271,"iness":1272,"St":1273,"lease":1274,"Ġca":1275,"Ġmust":1276,"Ġdirect":1277,"Ġgl":1278,"Ġ<":1279,"Ġopen":1280,"Ġpost":1281,"Ġcome":1282,"Ġseem":1283,"ording":1284,"Ġweek":1285,"ately":1286,"ital":1287,"Ġel":1288,"riend":1289,"Ġfar":1290,"Ġtra":1291,"inal":1292,"Ġpri":1293,"ĠUS":1294,"Ġplace":1295,"Ġform":1296,"Ġtold":1297,"\":":1298,"ains":1299,"ature":1300,"ĠTrump":1301,"Ġstand":1302,"Ġ#":1303,"ider":1304,"ĠFr":1305,"Ġnext":1306,"Ġsoc":1307,"Ġpur":1308,"Ġlet":1309,"Ġlittle":1310,"Ġhum":1311,"Ġi":1312,"ron":1313,"15":1314,"Ġ15":1315,"Ġcommun":1316,"Ġmark":1317,"ĠThere":1318,"Ġwr":1319,"ĠThat":1320,"Ġinformation":1321,"ways":1322,"Ġbus":1323,"app":1324,"Ġinvest":1325,"me":1326,"Ġhard":1327,"ained":1328,"ead":1329,"Ġimport":1330,"Ġappro":1331,"Ġtest":1332,"Ġtri":1333,"Ġrest":1334,"osed":1335,"Ġfull":1336,"Ġcare":1337,"ĠSp":1338,"Ġcase":1339,"ON":1340,"Ġsk":1341,"Ġless":1342,"Ġ+":1343,"Ġpartic":1344,"ĠPl":1345,"ably":1346,"uck":1347,"ished":1348,"chn":1349,"be":1350,"Ġlist":1351,"ator":1352,"Ġtop":1353,"Ġadv":1354,"ĠBe":1355,"ruct":1356,"Ġdem":1357,"ration":1358,"ling":1359,"gy":1360,"reen":1361,"ger":1362,"Ġhome":1363,"Ġleft":1364,"Ġbetter":1365,"Ġdata":1366,"Ġ11":1367,"Ġattack":1368,"Ġproble":1369,"line":1370,"ards":1371,"Ġbeh":1372,"ral":1373,"ĠHow":1374,"ĠShe":1375,"arge":1376,"Ġ--":1377,"://":1378,"Ġbro":1379,"ĠPh":1380,"ats":1381,"Ġbuild":1382,"ww":1383,"ided":1384,"aim":1385,"ases":1386,"ency":1387,"Ġmain":1388,"ined":1389,"Ġincluding":1390,"Ġ{":1391,"Ġgot":1392,"Ġinterest":1393,"Ġkeep":1394,"ĠX":1395,"Ġeas":1396,"aining":1397,"Ġclass":1398,"âĢ¦":1399,"ĠNo":1400,"Ġvar":1401,"Ġsmall":1402,"ample":1403,"AT":1404,"Ġide":1405,"ĠSo":1406,"Ġrece":1407,"Ġpolit":1408,"Ġmov":1409,"Ġplan":1410,"Ġpercent":1411,"iving":1412,"Ġcamp":1413,"Ġpay":1414,"14":1415,"sc":1416,"ised":1417,"Ġunt":1418,"oney":1419,"ploy":1420,"====":1421,"Ġdidn":1422,"ĠInd":1423,"els":1424,"ertain":1425,"Ġpos":1426,"____":1427,"iver":1428,"Ġprocess":1429,"Ġprogram":1430,"ified":1431,"ĠRep":1432,"16":1433,"uro":1434,"ology":1435,"atter":1436,"ina":1437,"Ġname":1438,"ĠAll":1439,"Ġfour":1440,"Ġreturn":1441,"vious":1442,"bs":1443,"Ġcalled":1444,"Ġmove":1445,"ĠSc":1446,"ird":1447,"Ġgroup":1448,"Ġbre":1449,"Ġmen":1450,"Ġcap":1451,"ten":1452,"ee":1453,"Ġdri":1454,"leg":1455,"here":1456,"uthor":1457,"Ġpat":1458,"Ġcurrent":1459,"ides":1460,"Ġpop":1461,"to":1462,"ention":1463,"Ġalways":1464,"Ġmil":1465,"Ġwomen":1466,"Ġ16":1467,"Ġold":1468,"iven":1469,"raph":1470,"ĠOr":1471,"ror":1472,"ently":1473,"Ġnear":1474,"ĠEx":1475,"ream":1476,"sh":1477,"Ġ14":1478,"Ġfree":1479,"ission":1480,"stand":1481,"ĠCon":1482,"ality":1483,"used":1484,"13":1485,"Ġdesign":1486,"Ġchange":1487,"Ġchang":1488,"Ġbo":1489,"Ġvis":1490,"ember":1491,"Ġbook":1492,"ready":1493,"Ġkill":1494,"25":1495,"pped":1496,"Ġaway":1497,"Ġable":1498,"Ġcountry":1499,"Ġconst":1500,"arn":1501,"Ġorder":1502,"AR":1503,"ior":1504,"ium":1505,"orth":1506,"18":1507,"ailable":1508,"Ġsw":1509,"Ġmillion":1510,"Ġ13":1511,"atic":1512,"ted":1513,"ĠGo":1514,"Ġoper":1515,"eng":1516,"Ġthing":1517,"ajor":1518,"conom":1519,"ĠComm":1520,"Ġwhy":1521,"ured":1522,"ural":1523,"Ġschool":1524,"by":1525,"ĠMar":1526,"Ġaff":1527,"Ġdays":1528,"Ġann":1529,"ush":1530,"ane":1531,"If":1532,"eg":1533,"Ġprof":1534,"Ġhealth":1535,"outh":1536,"But":1537,"ional":1538,".,":1539,"Ġsol":1540,"Ġalready":1
541,"Ġ30":1542,"Ġcharact":1543,"He":1544,"Ġfriend":1545,"ES":1546,"ians":1547,"icle":1548,"'d":1549,"ĠOn":1550,"Ġleast":1551,"Ġprom":1552,"Ġdr":1553,"Ġhist":1554,"ither":1555,"Ġest":1556,"iqu":1557,"17":1558,"son":1559,"Ġtell":1560,"Ġtalk":1561,"ohn":1562,"oint":1563,"lection":1564,"AN":1565,"Ġuntil":1566,"augh":1567,"Ġlater":1568,"Ġve":1569,"Ġview":1570,"ending":1571,"ived":1572,"Ġword":1573,"ware":1574,"Ġcost":1575,"Ġenough":1576,"Ġgive":1577,"ĠUnited":1578,"Ġtechn":1579,"arent":1580,"OR":1581,"Ġpar":1582,"ĠDr":1583,"Ġ2016":1584,"rist":1585,"ering":1586,"ĠÂ":1587,"Ġlarge":1588,"side":1589,"acy":1590,"ccess":1591,"Ġwin":1592,"Ġimportant":1593,"Ġ199":1594,"Ġdoesn":1595,"Ġ17":1596,"Ġbusiness":1597,"Ġclear":1598,"Ġrese":1599,"\",":1600,"ury":1601,"Ġequ":1602,"aster":1603,"alf":1604,"ĠAmerican":1605,"nect":1606,"Ġexpect":1607,"iversity":1608,"Ġocc":1609,"ĠFl":1610,"Ġkind":1611,"Ġmean":1612,"Ġpast":1613,"Ġdev":1614,"Ġbas":1615,"let":1616,"raft":1617,"Ġorgan":1618,"Ġdel":1619,"Ġperform":1620,"Ġstory":1621,"Ġseason":1622,"ĠCol":1623,"Ġclaim":1624,"Ġcame":1625,"Ġwithin":1626,"Ġline":1627,"Ġproject":1628,"ĠAt":1629,"Ġcontrol":1630,"ended":1631,"ĠSy":1632,"Ġair":1633,"ization":1634,"Ġ*":1635,"ley":1636,"Ġmoney":1637,"idd":1638,"You":1639,"for":1640,"Ġfamily":1641,"Ġmaking":1642,"Ġbit":1643,"Ġpolice":1644,"Ġhappen":1645,"Ġvers":1646,"ony":1647,"uff":1648,"ĠWhen":1649,"Ġsit":1650,"ideo":1651,"lf":1652,"ison":1653,"Ġsure":1654,"gin":1655,"Ġappear":1656,"Ġlight":1657,"Ġes":1658,"of":1659,"Ġwater":1660,"Ġtimes":1661,"not":1662,"Ġgrow":1663,"Ġcompany":1664,"ĠTe":1665,"ows":1666,"Ġmar":1667,"ource":1668,"iol":1669,"arm":1670,"br":1671,"Ġexample":1672,"Ġconc":1673,"Ġfore":1674,"ĠTo":1675,"pro":1676,"EN":1677,"ries":1678,"Ġ25":1679,"ĠCan":1680,"ney":1681,"Ġactually":1682,"Ġever":1683,"urity":1684,"aken":1685,"aps":1686,"Ġtax":1687,"Ġmajor":1688,"ama":1689,"Ġoften":1690,"eral":1691,"Ġhuman":1692,"Ġjob":1693,"ister":1694,"Ġavailable":1695,"ocr":1696,"enn":1697,"aid":1698,"ivid":1699,"Ġrecord":1700,"?\"":1701,"Ġsing":1702,"ĠAm":1703,"idence":1704,"Ġnews":1705,"ster":1706,"Ġeconom":1707,"Ġfollowing":1708,"ĠBr":1709,"ising":1710,"Ġhour":1711,"most":1712,"ument":1713,"Ġsex":1714,"Ġdesc":1715,"Ġbecome":1716,"ĠEd":1717,"Ġtook":1718,"Ġhaving":1719,"Ġproduct":1720,"ault":1721,"As":1722,"aring":1723,"Ġmeans":1724,"Ġhop":1725,"une":1726,"Ġcho":1727,"Ġcertain":1728,"Ġnon":1729,"Ġdeal":1730,"24":1731,"lement":1732,"oci":1733,"ene":1734,"Ġside":1735,"ĠPr":1736,"ĠMay":1737,"Ġreason":1738,"ued":1739,"ched":1740,"ulation":1741,"Ġelect":1742,"Ġofficial":1743,"Ġpossible":1744,"Ġhold":1745,"ands":1746,"ots":1747,"Ġcity":1748,"ories":1749,"Ġsever":1750,"Ġchildren":1751,"Ġonce":1752,"Ġactiv":1753,"ler":1754,"Ġnight":1755,"itions":1756,"ĠJohn":1757,"ape":1758,"play":1759,"Ġdone":1760,"Ġlim":1761,"Ġworking":1762,"ĠPres":1763,"orld":1764,"eb":1765,"ĠCo":1766,"Ġbody":1767,"ails":1768,"utes":1769,"ĠMr":1770,"Ġwhether":1771,"Ġauthor":1772,"rop":1773,"Ġproper":1774,"Ġseen":1775,");":1776,"Ġfac":1777,"ĠSu":1778,"Ġcond":1779,"iting":1780,"Ġcourse":1781,"Ġ}":1782,"----------------":1783,"aign":1784,"Ġevent":1785,"Ġeng":1786,"Ġpot":1787,"Ġintern":1788,"iam":1789,"Ġshort":1790,"empt":1791,"ãĤ":1792,"ĠGod":1793,"ilar":1794,"80":1795,"Ġorig":1796,"IS":1797,"ourn":1798,"ability":1799,"itive":1800,"Ġdam":1801,"Ġ100":1802,"Ġpress":1803,"Ġdoing":1804,"Ġprotect":1805,"ring":1806,"Ġthought":1807,"Ġquestion":1808,"rew":1809,"ĠWar":1810,"Ġseveral":1811,"ĠState":1812,"Ġgiven":1813,"Ġfund":1814,"ĠTw":1815,"Ġwent":1816,"ances":1817,"work":1818,"por":1819,
"my":1820,"40":1821,"Ġarg":1822,"artment":1823,"ustom":1824,"Ġpolic":1825,"Ġmeet":1826,"Ġcreat":1827,"22":1828,"ĠStates":1829,"Ġgames":1830,"raw":1831,"uture":1832,"Ġunderstand":1833,"urs":1834,"ĠOb":1835,"lish":1836,"sy":1837,"Ġmakes":1838,"Ġwon":1839,"agon":1840,"Ġhtt":1841,"Ġlove":1842,"ential":1843,"Ġcomplete":1844,"par":1845,"ĠIm":1846,"AL":1847,"Ġaccount":1848,"Âł":1849,"ored":1850,"vert":1851,"Ġident":1852,"Ġ2015":1853,"Ġothers":1854,"ĠMin":1855,"iber":1856,"verage":1857,"There":1858,"itional":1859,"dd":1860,"Ġprob":1861,"Ġyoung":1862,"Ġalong":1863,"Ġaccording":1864,"Ġyet":1865,"Ġmembers":1866,"ĠWhat":1867,"oid":1868,"ĠMan":1869,"And":1870,"Ġamong":1871,"ai":1872,"Ġemploy":1873,"ĠRes":1874,"Ġ>":1875,"Ġinvol":1876,"Ġlow":1877,"af":1878,"ĠCar":1879,"Ġhig":1880,"ĠOne":1881,"ĠSec":1882,"ination":1883,"Ġlikely":1884,"Ġant":1885,"aged":1886,"ĠRuss":1887,"Ġben":1888,"Ġrele":1889,"For":1890,"back":1891,"ĠNot":1892,"Ġpresident":1893,"ball":1894,"Ġaccess":1895,"ividual":1896,"ĠDem":1897,"ĠEuro":1898,"60":1899,"Ġknown":1900,"irl":1901,"ĠGr":1902,"Ġearly":1903,"use":1904,"iety":1905,"âĢĵ":1906,"Ġfight":1907,"Ġsent":1908,"Ġtoday":1909,"Ġmarket":1910,"\".":1911,"Ġbased":1912,"Ġstrong":1913,"urther":1914,"Ġdeb":1915,"mber":1916,"Ġproblem":1917,"Ġdeath":1918,"Ġsocial":1919,"imate":1920,"AS":1921,"ortun":1922,"Ġcampaign":1923,"ery":1924,"Ch":1925,"Ġey":1926,"ially":1927,"Ġmus":1928,"wh":1929,"pos":1930,"Ġer":1931,"Ġsaf":1932,"Ġmonths":1933,"iron":1934,"Ġviol":1935,"Ġfive":1936,"Ġstre":1937,"Ġplayers":1938,"inc":1939,"ald":1940,"year":1941,"aun":1942,"Ġsuccess":1943,"Ġpresent":1944,"erence":1945,"Ġ2014":1946,"Ġsugg":1947,"Ġparticular":1948,"Ġtry":1949,"Ġsuggest":1950,"ĠChrist":1951,"ones":1952,"Ġpriv":1953,"23":1954,"Ġcrit":1955,"Ġland":1956,"Ġlocal":1957,"ify":1958,"29":1959,"Ġaut":1960,"ED":1961,"ĠGu":1962,"Ġmult":1963,"Ġpolitical":1964,"Ġasked":1965,"Ġformer":1966,"itter":1967,"ript":1968,"Ġclose":1969,"Ġpract":1970,"ĠYork":1971,"Ġgetting":1972,"Ġacross":1973,"Ġcomb":1974,"Ġbelieve":1975,"Ġz":1976,"Ġtoget":1977,"Ġtogether":1978,"ĠCent":1979,"irc":1980,"Ġindividual":1981,"ĠMc":1982,"27":1983,"isk":1984,"ĠEng":1985,"Ġface":1986,"Ġ24":1987,"Ġvalue":1988,"Ġarea":1989,"ev":1990,"Ġwrit":1991,"ĠPresident":1992,"Ġvot":1993,"Ġkey":1994,"Ġmom":1995,"put":1996,"Ġanything":1997,"Ġexperience":1998,"attle":1999,"Ġmind":2000,"aff":2001,"omm":2002,"Ġfuture":2003,"ged":2004,"Ġcut":2005,"Ġtot":2006,"itch":2007,"Ġvideo":2008,"Ġinvestig":2009,"Ġnet":2010,"ĠMy":2011,"rict":2012,"ien":2013,".)":2014,"Ġimpro":2015,"though":2016,"wards":2017,"Ġconnect":2018,"ĠMed":2019,"selves":2020,"ensive":2021,"mb":2022,"ober":2023,"ators":2024,"An":2025,"Ġ50":2026,"Ġredu":2027,"resent":2028,"Ġabove":2029,"Ġfre":2030,"ĠEurope":2031,"sw":2032,"Ġamount":2033,"ĠApp":2034,"Ġeither":2035,"Ġmilit":2036,"Ġanal":2037,"Ġfail":2038,"ĠEn":2039,"ales":2040,"Ġspecial":2041,"Ġblack":2042,"IT":2043,"cher":2044,"Ġlooking":2045,"Ġfire":2046,"yn":2047,"Ġalmost":2048,"oon":2049,"Ġstudy":2050,"Ġmiss":2051,"ches":2052,"rown":2053,"Ġtre":2054,"Ġcommunity":2055,"Ġmedia":2056,"Ġfood":2057,"Ġcomes":2058,"ĠUniversity":2059,"Ġsingle":2060,"What":2061,"uly":2062,"Ġhalf":2063,"ague":2064,"hod":2065,"ĠRepublic":2066,"Ġstarted":2067,"Ġquick":2068,"oto":2069,"book":2070,"Ġissue":2071,"itor":2072,"Ġelse":2073,"Ġconsider":2074,"26":2075,"rodu":2076,"Ġtaken":2077,"28":2078,"99":2079,"ĠWith":2080,"Ġtrue":2081,"Ġwa":2082,"Ġtrad":2083,"Ġago":2084,"Ġmess":2085,"ief":2086,"Ġadded":2087,"oke":2088,"Ġbad":2089,"Ġfav":2090,"33":2091,"Ġsimilar":2092,"ask":2093,"ĠDon":2094,"Ġchar
acter":2095,"orts":2096,"ĠHouse":2097,"Ġreported":2098,"Ġtype":2099,"val":2100,"iod":2101,"ĠHowever":2102,"Ġtarg":2103,"Ġentire":2104,"pping":2105,"Ġhistory":2106,"Ġlive":2107,"ffic":2108,"........":2109,"ederal":2110,"Ġtrying":2111,"Ġdiscuss":2112,"ĠHar":2113,"aces":2114,"lished":2115,"Ġself":2116,"osp":2117,"rest":2118,"Ġroom":2119,"elt":2120,"Ġfall":2121,"olution":2122,"Ġet":2123,"Ġx":2124,"Ġisn":2125,"Ġidea":2126,"bo":2127,"Ġsound":2128,"ĠDep":2129,"Ġsomeone":2130,"cially":2131,"ully":2132,"Ġfoc":2133,"Ġobject":2134,"ift":2135,"aper":2136,"Ġplayer":2137,"Ġrather":2138,"Ġservice":2139,"ashing":2140,"ĠDo":2141,"ĠPart":2142,"rug":2143,"mon":2144,"ply":2145,"Ġmor":2146,"Ġnothing":2147,"Ġprovide":2148,"IC":2149,"ung":2150,"Ġparty":2151,"Ġexist":2152,"Ġmag":2153,"70":2154,"Ġrul":2155,"Ġhouse":2156,"Ġbehind":2157,"Ġhowever":2158,"ĠWorld":2159,"Ġsum":2160,"Ġapplic":2161,"Ġ;":2162,"Ġfunction":2163,"gr":2164,"ĠPol":2165,"Ġfront":2166,"200":2167,"Ġseries":2168,"Ġtem":2169,"Ġtyp":2170,"ills":2171,"Ġopt":2172,"Ġpoints":2173,"Ġbelow":2174,"itted":2175,"Ġspecific":2176,"Ġ2017":2177,"umb":2178,"Ġra":2179,"Ġprevious":2180,"Ġpret":2181,"reme":2182,"Ġcustom":2183,"Ġcourt":2184,"ĠMe":2185,"Ġrepl":2186,"Ġwhole":2187,"go":2188,"cer":2189,"Ġtreat":2190,"ĠAct":2191,"Ġprobably":2192,"Ġlearn":2193,"ender":2194,"ĠAss":2195,"Ġversion":2196,"now":2197,"Ġcheck":2198,"ĠCal":2199,"RE":2200,"minist":2201,"On":2202,"ources":2203,"Ġbenef":2204,"Ġdoc":2205,"Ġdeter":2206,"Ġenc":2207,"Ġsuper":2208,"Ġaddress":2209,"Ġvict":2210,"Ġ2013":2211,"Ġmeas":2212,"tr":2213,"Ġfield":2214,"When":2215,"Ġsignific":2216,"uge":2217,"Ġfeat":2218,"Ġcommon":2219,"load":2220,"Ġbegin":2221,"Ġbring":2222,"Ġaction":2223,"erman":2224,"Ġdescrib":2225,"Ġindust":2226,"Ġwanted":2227,"ried":2228,"ming":2229,"Ġattempt":2230,"45":2231,"fer":2232,"Ġdue":2233,"ression":2234,"##":2235,"Ġshall":2236,"Ġsix":2237,"oo":2238,"Ġstep":2239,"Ġpub":2240,"Ġhimself":2241,"Ġ23":2242,"Ġcop":2243,"Ġdest":2244,"Ġstop":2245,"AC":2246,"ibility":2247,"Ġlab":2248,"icult":2249,"Ġhours":2250,"Ġcreate":2251,"Ġfurther":2252,"ĠAmerica":2253,"ĠCity":2254,"Ġdou":2255,"head":2256,"ST":2257,"ĠNorth":2258,"cing":2259,"Ġnational":2260,"ule":2261,"ĠInst":2262,"Ġtaking":2263,"ĠQu":2264,"irt":2265,"Ġred":2266,"Ġresearch":2267,"viron":2268,"ĠGe":2269,"Ġbreak":2270,"ana":2271,"Ġspace":2272,"aterial":2273,"Ġrecent":2274,"ĠAb":2275,"Ġgeneral":2276,"Ġhit":2277,"Ġperiod":2278,"Ġeverything":2279,"ively":2280,"Ġphys":2281,"Ġsaying":2282,"anks":2283,"Ġcou":2284,"Ġcult":2285,"aced":2286,"eal":2287,"uation":2288,"Ġcoun":2289,"lu":2290,"Ġinclude":2291,"Ġposition":2292,"ĠAfter":2293,"ĠCanad":2294,"ĠEm":2295,"Ġimm":2296,"ĠRed":2297,"Ġpick":2298,"Ġcompl":2299,"Ġmatter":2300,"reg":2301,"ext":2302,"angu":2303,"isc":2304,"ole":2305,"aut":2306,"Ġcompet":2307,"eed":2308,"fect":2309,"Ġ21":2310,"ĠSen":2311,"ĠThese":2312,"asing":2313,"Ġcannot":2314,"Ġinit":2315,"Ġrelations":2316,"ached":2317,"Ġbar":2318,"Ġ40":2319,"ĠTH":2320,"Ġ2012":2321,"Ġvol":2322,"Ġground":2323,"Ġsecurity":2324,"Ġupd":2325,"ilt":2326,"35":2327,"Ġconcern":2328,"ĠJust":2329,"Ġwhite":2330,"Ġseems":2331,"ĠHer":2332,"pecially":2333,"ients":2334,"Ġannoun":2335,"Ġfig":2336,"ights":2337,"Ġstri":2338,"like":2339,"ids":2340,"Ġsus":2341,"Ġwatch":2342,"Ġâ":2343,"Ġwind":2344,"ĠCont":2345,"Ġitself":2346,"Ġmass":2347,"Al":2348,"yle":2349,"ique":2350,"ĠNational":2351,"Ġabs":2352,"Ġpack":2353,"Ġoutside":2354,"Ġanim":2355,"Ġpain":2356,"eter":2357,"Ġmanag":2358,"duct":2359,"ogn":2360,"Ġ]":2361,"ĠSept":2362,"sec":2363,"off":2364,"ĠJan":2365,"Ġfoot":2366,"ades":236
7,"Ġthird":2368,"Ġmot":2369,"Ġevidence":2370,"inton":2371,"Ġthreat":2372,"apt":2373,"ples":2374,"cle":2375,"Ġlo":2376,"Ġdecl":2377,"Ġitem":2378,"medi":2379,"Ġrepresent":2380,"omb":2381,"amer":2382,"Ġsignificant":2383,"ograph":2384,"su":2385,"Ġcal":2386,"ires":2387,"0000":2388,"ID":2389,"AM":2390,"Ġsimply":2391,"Ġlonger":2392,"Ġfile":2393,"OT":2394,"che":2395,"So":2396,"ateg":2397,"org":2398,"ĠHis":2399,"Ġener":2400,"Ġdom":2401,"Ġupon":2402,"ili":2403,"\":\"":2404,"Ġthemselves":2405,"Ġcoming":2406,"Ġquite":2407,"Ġdifficult":2408,"ĠBar":2409,"ilities":2410,"rel":2411,"ends":2412,"cial":2413,"64":2414,"Ġwoman":2415,"rap":2416,"yr":2417,"Ġnecess":2418,"ips":2419,"Ġtext":2420,"Ġrequire":2421,"Ġmilitary":2422,"Ġreview":2423,"Ġrespons":2424,"75":2425,"Ġsubject":2426,"Ġinstead":2427,"Ġissues":2428,"Ġgen":2429,"\",\"":2430,"Ġminutes":2431,"Ġweap":2432,"ray":2433,"amed":2434,"time":2435,"bl":2436,"How":2437,"Ġcode":2438,"ĠSm":2439,"Ġhigher":2440,"ĠSte":2441,"ris":2442,"Ġpage":2443,"Ġstudents":2444,"ĠIntern":2445,"Ġmethod":2446,"ĠAug":2447,"ĠPer":2448,"ĠAg":2449,"Ġpolicy":2450,"ĠSw":2451,"Ġexec":2452,"Ġaccept":2453,"ume":2454,"ribut":2455,"Ġwords":2456,"Ġfinal":2457,"Ġchanges":2458,"ĠDemocr":2459,"Ġfriends":2460,"Ġrespect":2461,"Ġep":2462,"Ġcompan":2463,"ivil":2464,"Ġdamage":2465,"****":2466,"ogle":2467,"vironment":2468,"Ġneg":2469,"ental":2470,"Ġap":2471,"Ġtotal":2472,"ival":2473,"!\"":2474,"lim":2475,"Ġneeds":2476,"Ġagre":2477,"Ġdevelopment":2478,"Ġage":2479,"iple":2480,"21":2481,"Ġresults":2482,"ĠAf":2483,"Sh":2484,"Ġgun":2485,"ĠObama":2486,"roll":2487,"Ġ@":2488,"Ġrights":2489,"ĠBrit":2490,"Ġrunning":2491,"Ġwasn":2492,"Ġport":2493,"Ġrate":2494,"Ġpretty":2495,"Ġtarget":2496,"Ġsaw":2497,"Ġcirc":2498,"Ġworks":2499,"icro":2500,"alt":2501,"over":2502,"www":2503,"That":2504,"lier":2505,"Ġeveryone":2506,"ude":2507,"Ġpie":2508,"iddle":2509,"rael":2510,"Ġrad":2511,"Ġblock":2512,"Ġwalk":2513,"To":2514,"ãģ":2515,"nes":2516,"ĠAust":2517,"aul":2518,"rote":2519,"ĠSouth":2520,"ession":2521,"oph":2522,"Ġshows":2523,"Ġsite":2524,"Ġjo":2525,"Ġrisk":2526,"clus":2527,"lt":2528,"Ġinj":2529,"iding":2530,"ĠSpe":2531,"Ġchall":2532,"irm":2533,"Ġ22":2534,"itting":2535,"str":2536,"Ġhy":2537,"LE":2538,"key":2539,"Ġbegan":2540,"atur":2541,"ashington":2542,"lam":2543,"ĠDav":2544,"bit":2545,"Ġsize":2546,"ĠPar":2547,"38":2548,"ournal":2549,"face":2550,"Ġdecision":2551,"Ġlarg":2552,"Ġjud":2553,"rect":2554,"Ġcontinue":2555,"ĠOct":2556,"overed":2557,"ĠInt":2558,"========":2559,"Ġparent":2560,"ĠWill":2561,"Ġeasy":2562,"Ġdrug":2563,"anger":2564,"Ġsense":2565,"Ġdi":2566,"iday":2567,"Ġenergy":2568,"istic":2569,"Ġassoci":2570,"arter":2571,"obal":2572,"eks":2573,"ĠEl":2574,"urch":2575,"Ġgirl":2576,"oe":2577,"itle":2578,"Ġ28":2579,"ĠChe":2580,"Ġrequest":2581,"Ġsoon":2582,"Ġhost":2583,"ky":2584,"Ġstates":2585,"omes":2586,"Ġmaterial":2587,"lex":2588,"Ġmoment":2589,"Ġansw":2590,"onse":2591,"Ġespecially":2592,"Ġnorm":2593,"Ġservices":2594,"pite":2595,"ran":2596,"Ġrole":2597,"44":2598,"):":2599,"Ġcred":2600,"Cl":2601,"________":2602,"Ġmat":2603,"Ġlog":2604,"ĠClinton":2605,"OU":2606,"Ġoffice":2607,"Ġ26":2608,"Ġcharg":2609,"Ġtrack":2610,"ma":2611,"Ġheart":2612,"Ġball":2613,"Ġpersonal":2614,"Ġbuilding":2615,"na":2616,"set":2617,"body":2618,"ĠBlack":2619,"Ġincrease":2620,"itten":2621,"Ġneeded":2622,"36":2623,"32":2624,"=\"":2625,"Ġlost":2626,"Ġbecame":2627,"Ġgroups":2628,"ĠMus":2629,"Ġwrote":2630,"ĠPe":2631,"Ġprop":2632,"joy":2633,"é":2634,"ĠWhite":2635,"Ġdead":2636,".'":2637,"Ġhttp":2638,"Ġwebs":2639,"OS":2640,"Ġinside":2641,"Ġwrong":2642,"Ġstatem
ent":2643,"Ġ...":2644,"yl":2645,"Ġfilm":2646,"Ġmusic":2647,"Ġshare":2648,"ification":2649,"Ġrelease":2650,"Ġforward":2651,"Ġstay":2652,"Ġcomput":2653,"itte":2654,"ser":2655,"Ġoriginal":2656,"Ġcard":2657,"Ġcand":2658,"Ġdiv":2659,"atural":2660,"Ġfavor":2661,"OM":2662,"Ġcases":2663,"uses":2664,"Ġsection":2665,"Ġleave":2666,"ging":2667,"oved":2668,"ĠWashington":2669,"39":2670,"ĠGl":2671,"Ġrequired":2672,"action":2673,"apan":2674,"oor":2675,"iter":2676,"ĠKing":2677,"Ġcountries":2678,"ĠGerman":2679,"lling":2680,"Ġ27":2681,"34":2682,"Ġquestions":2683,"Ġprim":2684,"Ġcell":2685,"Ġshoot":2686,"Ġanyone":2687,"ĠWest":2688,"Ġaffect":2689,"epend":2690,"Ġonline":2691,"ĠIsrael":2692,"ĠSeptember":2693,"Ġability":2694,"Ġcontent":2695,"ises":2696,"Ġreve":2697,"Ġlaun":2698,"Ġindic":2699,"Ġforce":2700,"cast":2701,"Ġsold":2702,"aving":2703,"fl":2704,"Ġsoft":2705,"Ġcompanies":2706,"ceed":2707,"Ġarticle":2708,"Ġaud":2709,"Ġrev":2710,"Ġeduc":2711,"Ġplaying":2712,"05":2713,"Ġheld":2714,"ctor":2715,"Ġreleased":2716,"Ġfederal":2717,"37":2718,"Ġadminist":2719,"Ġinterview":2720,"Ġinstall":2721,"Ġreceived":2722,"Ġsource":2723,"uk":2724,"Ph":2725,"Ġserious":2726,"Ġcreated":2727,"Ġcause":2728,"Ġimmedi":2729,"Ġdefin":2730,"uel":2731,"ĠDepartment":2732,"ctions":2733,"ĠCour":2734,"ĠNow":2735,"ze":2736,"ites":2737,"itution":2738,"Ġlate":2739,"Ġspeak":2740,"ners":2741,"Ġlegal":2742,"ari":2743,"ĠCor":2744,"Ġweeks":2745,"Ġmodel":2746,"Ġpred":2747,"Ġexact":2748,"BC":2749,"ĠBy":2750,"ING":2751,"osing":2752,"Ġtakes":2753,"Ġregard":2754,"Ġopportun":2755,"Ġprice":2756,"Ġ198":2757,"ĠApr":2758,"fully":2759,"Ġord":2760,"Ġproblems":2761,"ruction":2762,"ham":2763,"ĠCount":2764,"lege":2765,"Ġleaders":2766,"ET":2767,"lev":2768,"Ġdeep":2769,"ological":2770,"ese":2771,"haps":2772,"ĠSome":2773,"Ġpers":2774,"Ġcontract":2775,"Ġrelationship":2776,"sp":2777,"oud":2778,"Ġbase":2779,"48":2780,"mit":2781,"Ad":2782,"ancial":2783,"Ġconsum":2784,"Ġpotential":2785,"Ġlangu":2786,"rem":2787,"eth":2788,"Ġrelig":2789,"ressed":2790,"66":2791,"Ġlink":2792,"Ġlower":2793,"ayer":2794,"ĠJune":2795,"Ġfem":2796,"unt":2797,"erc":2798,"urd":2799,"Ġcontact":2800,"Ġill":2801,"Ġmother":2802,"Ġestab":2803,"htt":2804,"ĠMarch":2805,"ĠBro":2806,"ĠChina":2807,"Ġ29":2808,"Ġsqu":2809,"Ġprovided":2810,"Ġaverage":2811,"asons":2812,"Ġ2011":2813,"Ġexam":2814,"lin":2815,"55":2816,"ned":2817,"Ġperfect":2818,"Ġtou":2819,"alse":2820,"ux":2821,"Ġbuy":2822,"Ġshot":2823,"Ġcollect":2824,"Ġphot":2825,"Ġplayed":2826,"Ġsurpr":2827,"Ġofficials":2828,"Ġsimple":2829,"avy":2830,"Ġindustry":2831,"Ġhands":2832,"ground":2833,"Ġpull":2834,"Ġround":2835,"Ġuser":2836,"Ġrange":2837,"uary":2838,"Ġprivate":2839,"ops":2840,"ees":2841,"Ġways":2842,"ĠMich":2843,"Ġveh":2844,"Ġexcept":2845,"Ġterms":2846,"imum":2847,"pper":2848,"ION":2849,"ores":2850,"ĠDragon":2851,"oul":2852,"Ġden":2853,"Ġperformance":2854,"Ġbill":2855,"cil":2856,"47":2857,"Ġenvironment":2858,"Ġexc":2859,"add":2860,"Ġworth":2861,"Ġpict":2862,"Ġchance":2863,"Ġ2018":2864,"bor":2865,"Ġspeed":2866,"iction":2867,"Ġalleg":2868,"ĠJapan":2869,"atory":2870,"reet":2871,"Ġmatch":2872,"ĠII":2873,"Ġstru":2874,"order":2875,"Ġste":2876,"Ġliving":2877,"Ġstruct":2878,"ino":2879,"Ġsepar":2880,"hern":2881,"Ġresponse":2882,"Ġenjoy":2883,"Ġvia":2884,"AD":2885,"uments":2886,"acebook":2887,"Ġmember":2888,"ibr":2889,"izing":2890,"Ġtool":2891,"ĠMon":2892,"ĠWhile":2893,"hood":2894,"ĠAng":2895,"ĠDef":2896,"Ġoffer":2897,"Tr":2898,"aur":2899,"Ġturned":2900,"ĠJuly":2901,"down":2902,"anced":2903,"Ġrecently":2904,"ĠEar":2905,"Ġce":2906,"ĠStar":2907,"ĠCong":2908,"rought":2909,
"Ġblood":2910,"Ġhope":2911,"Ġcomment":2912,"aint":2913,"Ġarri":2914,"iles":2915,"Ġparticip":2916,"ought":2917,"ription":2918,"08":2919,"49":2920,"Ġgave":2921,"Ġselect":2922,"Ġkilled":2923,"sych":2924,"Ġgoes":2925,"ij":2926,"Ġcoll":2927,"Ġimpact":2928,"atives":2929,"ĠSer":2930,"09":2931,"ĠAugust":2932,"Ġboy":2933,"de":2934,"ĠDes":2935,"Ġfelt":2936,"US":2937,"Ġexpected":2938,"Ġimage":2939,"ĠMark":2940,"ccording":2941,"oice":2942,"EC":2943,"ĠMag":2944,"ened":2945,"hold":2946,"ĠPost":2947,"Ġprevent":2948,"No":2949,"Ġinvolved":2950,"Ġeyes":2951,"Ġquickly":2952,"At":2953,"unk":2954,"Ġbehav":2955,"Ġur":2956,"Ġled":2957,"come":2958,"ey":2959,"Ġcandid":2960,"Ġearlier":2961,"Ġfocus":2962,"ety":2963,"Pro":2964,"ledge":2965,"ixed":2966,"illed":2967,"Ġpopular":2968,"AP":2969,"Ġsett":2970,"light":2971,"Ġvarious":2972,"inks":2973,"Ġlevels":2974,"Ġroad":2975,"ellig":2976,"ables":2977,"hel":2978,"ittee":2979,"ĠGener":2980,"ype":2981,"Ġheard":2982,"icles":2983,"Ġmis":2984,"Ġusers":2985,"ĠSan":2986,"Ġimprove":2987,"Ġfather":2988,"Ġsearch":2989,"They":2990,"vil":2991,"Ġprofess":2992,"Ġknew":2993,"Ġloss":2994,"Ġevents":2995,"65":2996,"Ġbillion":2997,"07":2998,"02":2999,"ĠNews":3000,"ĠAM":3001,"Ġcover":3002,"where":3003,"ension":3004,"Ġbott":3005,"Ġareas":3006,"ences":3007,"ope":3008,"ĠTwitter":3009,"ael":3010,"Ġgets":3011,"ĠGoogle":3012,"Ġsn":3013,"iant":3014,"Ġvote":3015,"Ġnearly":3016,"Ġincluded":3017,"Ġrecogn":3018,"zz":3019,"mm":3020,"aled":3021,"Ġhappened":3022,"04":3023,"Ġhot":3024,"Ġwhose":3025,"Ġcivil":3026,"Ġsuff":3027,"oes":3028,"itiz":3029,"ĠSyri":3030,"Ġrespond":3031,"Ġhon":3032,"Ġfeatures":3033,"Ġeconomic":3034,"ĠApril":3035,"rim":3036,"Ġtechnology":3037,"Ġoption":3038,"aging":3039,"Ġpurch":3040,"Re":3041,"Ġlat":3042,"chie":3043,"isl":3044,"Ġrecomm":3045,"uf":3046,"Ġtraining":3047,"Ġeffects":3048,"Ġfast":3049,"Ġ2010":3050,"Ġoccur":3051,"Ġwebsite":3052,"Ġemail":3053,"Ġsens":3054,"ech":3055,"Ġoil":3056,"Ġinflu":3057,"Ġcurrently":3058,"ĠSch":3059,"ĠAdd":3060,"Ġgoal":3061,"Ġscient":3062,"Ġconv":3063,"100":3064,"emy":3065,"Ġdecided":3066,"Ġtravel":3067,"Ġmention":3068,"LL":3069,"03":3070,"Ġelection":3071,"Ġphone":3072,"Ġlooks":3073,"Ġsituation":3074,"Ġcy":3075,"Ġhor":3076,"bed":3077,"ĠCourt":3078,"aily":3079,"aves":3080,"Ġquality":3081,"ĠComp":3082,"wise":3083,"Ġtable":3084,"Ġstaff":3085,"ĠWind":3086,"ett":3087,"Ġtried":3088,"idered":3089,"Ġaddition":3090,"Ġbox":3091,"Ġlack":3092,"arily":3093,"Ġwide":3094,"Ġmid":3095,"Ġboard":3096,"ysis":3097,"Ġanti":3098,"ha":3099,"Ġdig":3100,"ening":3101,"Ġdro":3102,"Con":3103,"68":3104,"Ġslow":3105,"based":3106,"sequ":3107,"Ġpath":3108,"Ex":3109,"aker":3110,"Ġworked":3111,"Ġpen":3112,"Ġengine":3113,"Ġlooked":3114,"ĠSuper":3115,"ĠServ":3116,"Ġvictim":3117,"Un":3118,"Ġproperty":3119,"Ġintrodu":3120,"Ġexecut":3121,"ĠPM":3122,"Le":3123,"Ġcolor":3124,"ĠMore":3125,"Ġ60":3126,"Ġnetwork":3127,"Ġdate":3128,"cul":3129,"idge":3130,"Ġextra":3131,"31":3132,"Ġsle":3133,"67":3134,"Ġwond":3135,"Ġreports":3136,"just":3137,"ĠAustral":3138,"Ġcapital":3139,"Ġens":3140,"Ġcommand":3141,"Ġallowed":3142,"Ġprep":3143,"Ġcapt":3144,"hib":3145,"Ġnumbers":3146,"chan":3147,"Ġfair":3148,"mp":3149,"oms":3150,"Ġreach":3151,"With":3152,"tain":3153,"Ġbroad":3154,"Ġcouple":3155,"ecause":3156,"lying":3157,"ĠFeb":3158,"Ġscreen":3159,"Ġlives":3160,"Ġprior":3161,"ĠCongress":3162,"Ar":3163,"Ġapproach":3164,"Ġemer":3165,"aries":3166,"ĠDis":3167,"serv":3168,"ĠNe":3169,"Ġbuilt":3170,"cies":3171,"Ġrepe":3172,"Ġrules":3173,"force":3174,"ĠPal":3175,"Ġfinancial":3176,"Ġconsidered":3177,"ĠChar":3178,"nces":3179,"ĠI
S":3180,"Ġbrought":3181,"Ġbi":3182,"iers":3183,"ĠSim":3184,"OP":3185,"Ġproducts":3186,"Ġvisit":3187,"Ġdocument":3188,"Ġconduct":3189,"Ġcompletely":3190,"ining":3191,"ĠCalif":3192,"ibly":3193,"Ġwritten":3194,"ĠTV":3195,"ements":3196,"Ġdraw":3197,"One":3198,"Ġpublished":3199,"Ġsecret":3200,"rain":3201,"het":3202,"ĠFacebook":3203,"onday":3204,"ĠUp":3205,"Ġsexual":3206,"Ġthous":3207,"ĠPat":3208,"Ġess":3209,"Ġstandard":3210,"Ġarm":3211,"ges":3212,"ection":3213,"Ġfell":3214,"Ġforeign":3215,"ani":3216,"ĠFriday":3217,"Ġregular":3218,"inary":3219,"Ġincreased":3220,"Ġusually":3221,"Ġdemon":3222,"Ġdark":3223,"Ġadditional":3224,"rol":3225,"ĠOf":3226,"Ġproduction":3227,"!!":3228,"undred":3229,"Ġinternational":3230,"idents":3231,"ĠFree":3232,"roup":3233,"Ġrace":3234,"Ġmach":3235,"Ġhuge":3236,"All":3237,"lear":3238,"ovember":3239,"Ġtown":3240,"Ġattention":3241,"ĠOff":3242,"yond":3243,"ĠThen":3244,"field":3245,"Ġterror":3246,"raz":3247,"ĠBo":3248,"Ġmeeting":3249,"ĠPark":3250,"Ġarrest":3251,"Ġfear":3252,"Ġaw":3253,"ĠVal":3254,"oring":3255,"',":3256,"Ġextreme":3257,"arr":3258,"Ġworkers":3259,"After":3260,"Ġ31":3261,"net":3262,"ament":3263,"Ġdirectly":3264,"Ġpopulation":3265,"ube":3266,"ĠOctober":3267,"ĠIN":3268,"ĠJanuary":3269,"59":3270,"ĠDavid":3271,"Ġcross":3272,"cember":3273,"ĠFirst":3274,"Ġmessage":3275,"irit":3276,"Ġnation":3277,"Ġpoll":3278,"isions":3279,"Ġanswer":3280,"ny":3281,"isode":3282,"Ġcarry":3283,"ĠRussia":3284,"Ġhear":3285,"ength":3286,"roy":3287,"Ġnatural":3288,"inally":3289,"Ġdog":3290,"mitted":3291,"Ġtrade":3292,"Ġsubst":3293,"Ġmultiple":3294,"ĠAfric":3295,"Ġfans":3296,"Ġsort":3297,"Ġglobal":3298,"ication":3299,"ĠWed":3300,"ara":3301,"Ġachie":3302,"Ġlanguage":3303,"vey":3304,"Ġtal":3305,"Ġnecessary":3306,"Ġdetails":3307,"Ġsen":3308,"ĠSund":3309,"ĠReg":3310,"ĠRec":3311,"06":3312,"Ġsil":3313,"ressive":3314,"Ġmedical":3315,"unch":3316,"ornia":3317,"Ġund":3318,"fort":3319,"ocks":3320,"ĠMonday":3321,"uesday":3322,"craft":3323,"77":3324,"urt":3325,"Ġver":3326,"ĠHill":3327,"Ġreceive":3328,"Ġmorning":3329,"estern":3330,"Ġbank":3331,"Ġsat":3332,"irth":3333,"ĠHigh":3334,"Ġdevice":3335,"ĠTHE":3336,"ĠCenter":3337,"Ġsafe":3338,"Ġple":3339,"ĠCanada":3340,"Ġsystems":3341,"Ġassist":3342,"Ġsurv":3343,"Ġbattle":3344,"ĠSoc":3345,"vertis":3346,"She":3347,"Ġpaper":3348,"Ġgrowth":3349,"Ġcast":3350,"Sc":3351,"Ġplans":3352,"lled":3353,"Ġparts":3354,"Ġwall":3355,"Ġmovement":3356,"Ġpractice":3357,"imately":3358,"Ġdisplay":3359,"Ġsometimes":3360,"omp":3361,"ĠPaul":3362,"ĠYes":3363,"king":3364,"58":3365,"oly":3366,"Ġson":3367,"Ġavoid":3368,"okes":3369,"ĠJew":3370,"Ġtowards":3371,"asc":3372,"Ġ//":3373,"ĠKore":3374,"Ġtalking":3375,"Ġcorrect":3376,"Ġspent":3377,"icks":3378,"iable":3379,"eared":3380,"Ġterm":3381,"Ġwants":3382,"oming":3383,"Ġut":3384,"Ġdoub":3385,"Ġforces":3386,"Ġplease":3387,"69":3388,"ĠNovember":3389,"atform":3390,"ondon":3391,"Ġones":3392,"Ġimmediately":3393,"ĠRussian":3394,"ĠMet":3395,"Ġdeg":3396,"Ġparents":3397,"CH":3398,"ĠAmericans":3399,"aly":3400,"ĠMod":3401,"Ġshown":3402,"Ġconditions":3403,"Ġstuff":3404,"Ġreb":3405,"ĠYour":3406,"Ġincludes":3407,"nown":3408,"ĠSam":3409,"Ġexperien":3410,"mission":3411,"ĠEven":3412,"aught":3413,"Ġannounced":3414,"ĠRepublican":3415,"Ġdetermin":3416,"Ġdescribed":3417,"ĠCounty":3418,"()":3419,"Ġdoor":3420,"Ġchanged":3421,"Ġneigh":3422,"ĠHere":3423,"Ġclean":3424,"Ġpan":3425,"ĠDecember":3426,"ĠEuropean":3427,"iring":3428,"apter":3429,"Ġclub":3430,"ĠTuesday":3431,"Ġpaid":3432,"ĠNet":3433,"Ġattacks":3434,"Ġcharacters":3435,"Ġalone":3436,"Ġdirector":3437,"dom":3438,"Ġ35":34
39,"Ġload":3440,"Ġrout":3441,"ĠCalifornia":3442,"Ġfinally":3443,"Ġrac":3444,"Ġcontr":3445,"Ġexactly":3446,"resh":3447,"pri":3448,"ĠIslam":3449,"Ġnature":3450,"Ġcareer":3451,"Ġlatest":3452,"Ġconvers":3453,"ĠSl":3454,"pose":3455,"cient":3456,"ĠInc":3457,"ivity":3458,"88":3459,"ĠAtt":3460,"ĠMor":3461,"nesday":3462,"Ġweight":3463,"ken":3464,"Ġnote":3465,"Ġteams":3466,"Ġ\\":3467,"airs":3468,"ĠGreen":3469,"Ġhundred":3470,"onent":3471,"Ġstreng":3472,"Ġconsist":3473,"icated":3474,"Ġregul":3475,"Ġlic":3476,"astic":3477,"Ġten":3478,"ursday":3479,"elligence":3480,"ously":3481,"ĠUK":3482,"BI":3483,"Ġcosts":3484,"Ġindepend":3485,"ĠAP":3486,"Ġnormal":3487,"Ġhom":3488,"Ġobvious":3489,"Ġswe":3490,"Ġstar":3491,"Ġready":3492,"acher":3493,"Ġimplement":3494,"gest":3495,"Ġsong":3496,"ĠGet":3497,"ĠLab":3498,"Ġinteresting":3499,"using":3500,"Ġgiving":3501,"ĠSunday":3502,"Ġetc":3503,"Ġmiddle":3504,"Ġremember":3505,"right":3506,"osition":3507,"utions":3508,"Ġmax":3509,"46":3510,"Ġyourself":3511,"Ġdemand":3512,"Ġtreatment":3513,"Ġdanger":3514,"ĠCons":3515,"Ġguy":3516,"ĠBritish":3517,"Ġphysical":3518,"Ġrelated":3519,"Ġremain":3520,"Ġcouldn":3521,"Ġrefer":3522,"Ġcitiz":3523,"box":3524,"ENT":3525,"board":3526,"Ġinn":3527,"IG":3528,"ero":3529,"ĠStreet":3530,"ospital":3531,"rench":3532,"chers":3533,"Ġstra":3534,"OL":3535,"ager":3536,"ĠAN":3537,"Ġeasily":3538,"IA":3539,"enge":3540,"iny":3541,"Ġclos":3542,"ocked":3543,"Ġuses":3544,"ĠCoun":3545,"Im":3546,"uild":3547,"??":3548,"more":3549,"Ġang":3550,"Ġwrite":3551,"olute":3552,"57":3553,"Ġleader":3554,"Ġreading":3555,"":3784,"Ġfigure":3785,"Ġdisapp":3786,"enty":3787,"Ġsoftware":3788,"Ġult":3789,"Ġofficers":3790,"New":3791,"Is":3792,"Ġremains":3793,"ĠIndia":3794,"Ġpsych":3795,"rief":3796,"Ġcat":3797,"esc":3798,"Ġobserv":3799,"Ġstage":3800,"ĠDark":3801,"Ġenter":3802,"change":3803,"Ġpassed":3804,"Ġdespite":3805,"ĠOut":3806,"Ġmovie":3807,"rs":3808,"Ġvoice":3809,"mine":3810,"ĠPlay":3811,"Ġtoward":3812,"ĠTer":3813,"Ġregion":3814,"Ġvalues":3815,"orters":3816,"Ġmount":3817,"Ġofficer":3818,"ĠOther":3819,"ban":3820,"Ġhous":3821,"wood":3822,"room":3823,"IV":3824,"ĠSun":3825,"see":3826,"ĠOver":3827,"rog":3828,"90":3829,"Ġlay":3830,"ĠTur":3831,"awn":3832,"Ġpressure":3833,"ĠSub":3834,"Ġbooks":3835,"edom":3836,"ĠSand":3837,"AA":3838,"ago":3839,"Ġreasons":3840,"ford":3841,"Ġactivity":3842,"UT":3843,"Now":3844,"ĠSenate":3845,"cell":3846,"night":3847,"Ġcalls":3848,"inter":3849,"Ġletter":3850,"ĠRob":3851,"ĠJe":3852,"Ġchoose":3853,"ĠLaw":3854,"Get":3855,"Be":3856,"Ġrob":3857,"Ġtypes":3858,"Ġplatform":3859,"Ġquarter":3860,"RA":3861,"ĠTime":3862,"Ġmaybe":3863,"ĠCr":3864,"95":3865,"pre":3866,"Ġmoving":3867,"Ġlif":3868,"Ġgold":3869,"Ġsom":3870,"Ġpatients":3871,"Ġtruth":3872,"ĠKe":3873,"urance":3874,"antly":3875,"mar":3876,"Ġcharge":3877,"ĠGreat":3878,"Ġcele":3879,"--------------------------------":3880,"Ġrock":3881,"roid":3882,"ancy":3883,"Ġcredit":3884,"aud":3885,"By":3886,"ĠEvery":3887,"Ġmoved":3888,"inger":3889,"ribution":3890,"Ġnames":3891,"Ġstraight":3892,"ĠHealth":3893,"ĠWell":3894,"Ġfeature":3895,"Ġrule":3896,"Ġsche":3897,"inated":3898,"ĠMichael":3899,"berg":3900,"41":3901,"iled":3902,"band":3903,"Ġclick":3904,"ĠAngel":3905,"onents":3906,"ÂŃ":3907,"ĠIraq":3908,"ĠSaturday":3909,"Ġaware":3910,"part":3911,"Ġpattern":3912,"OW":3913,"ĠLet":3914,"Ġgrad":3915,"igned":3916,"Ġassociated":3917,"Ġstyle":3918,"no":3919,"iation":3920,"aith":3921,"ilies":3922,"Ġstories":3923,"uration":3924,"Ġindividuals":3925,"ĠâĢ¦":3926,"miss":3927,"ĠAssoci":3928,"ishing":3929,"aby":3930,"Ġsummer":3931,"ĠBen":3932,"Ġ32":
3933,"Ġarch":3934,"uty":3935,"ĠTexas":3936,"hol":3937,"Ġfully":3938,"Ġmill":3939,"Ġfollowed":3940,"ĠBill":3941,"ĠIndian":3942,"ĠSecret":3943,"ĠBel":3944,"ĠFebruary":3945,"Ġjobs":3946,"Ġseemed":3947,"ĠGovern":3948,"ipped":3949,"Ġreality":3950,"Ġlines":3951,"Ġpark":3952,"Ġmeasure":3953,"ĠOur":3954,"IM":3955,"Ġbrother":3956,"Ġgrowing":3957,"Ġban":3958,"Ġestim":3959,"Ġcry":3960,"ĠSchool":3961,"Ġmechan":3962,"ĠOF":3963,"ĠWindows":3964,"Ġrates":3965,"ĠOh":3966,"Ġpositive":3967,"Ġculture":3968,"istics":3969,"ica":3970,"Ġhar":3971,"ya":3972,"itely":3973,"ipp":3974,"Ġmap":3975,"encies":3976,"ĠWilliam":3977,"II":3978,"akers":3979,"56":3980,"ĠMart":3981,"ĠRem":3982,"Ġaltern":3983,"itude":3984,"Ġcoach":3985,"rowd":3986,"Don":3987,"Ġkids":3988,"Ġjournal":3989,"Ġcorpor":3990,"Ġfalse":3991,"Ġweb":3992,"Ġsleep":3993,"Ġcontain":3994,"Ġsto":3995,"Ġbed":3996,"iverse":3997,"ĠRich":3998,"ĠChinese":3999,"Ġpun":4000,"Ġmeant":4001,"known":4002,"Ġnotice":4003,"Ġfavorite":4004,"aven":4005,"Ġcondition":4006,"Ġpurpose":4007,"))":4008,"Ġorganization":4009,"Ġchalleng":4010,"Ġmanufact":4011,"Ġsusp":4012,"ĠAc":4013,"Ġcritic":4014,"unes":4015,"uclear":4016,"Ġmer":4017,"vention":4018,"Ġ80":4019,"Ġmist":4020,"ĠUs":4021,"ĠTor":4022,"http":4023,"olf":4024,"Ġlarger":4025,"Ġadvant":4026,"Ġresear":4027,"Ġactions":4028,"ml":4029,"Ġkept":4030,"Ġaim":4031,",'":4032,"col":4033,"Ġbenefits":4034,"ifying":4035,"Ġactual":4036,"ĠInternational":4037,"Ġvehicle":4038,"Ġchief":4039,"Ġefforts":4040,"ĠLeague":4041,"ĠMost":4042,"Ġwait":4043,"Ġadult":4044,"Ġoverall":4045,"Ġspeech":4046,"Ġhighly":4047,"Ġfemale":4048,"Ġerror":4049,"Ġeffective":4050,"54":4051,"Ġencour":4052,"well":4053,"Ġfailed":4054,"Ġconserv":4055,"Ġprograms":4056,"Ġtrou":4057,"Ġahead":4058,"500":4059,"vertisement":4060,"IP":4061,"ĠFound":4062,"pir":4063,"Ġ%":4064,"Ġcrime":4065,"ander":4066,"Ġlocation":4067,"ĠIran":4068,"Ġbehavior":4069,"azing":4070,"Ġrare":4071,"Ġemb":4072,"Ġcaused":4073,"Ġship":4074,"Ġactive":4075,"Ġcontribut":4076,"Ġgreen":4077,"Ġacqu":4078,"Ġreflect":4079,"venue":4080,"Ġfirm":4081,"Ġbirth":4082,"].":4083,"Ġclearly":4084,"Ġemot":4085,"Ġagency":4086,"riage":4087,"Ġmemory":4088,"98":4089,"SA":4090,"ĠSee":4091,"acing":4092,"CC":4093,"Ġbiggest":4094,"Ġrap":4095,"Ġbasic":4096,"Ġband":4097,"eat":4098,"Ġsuspect":4099,"ĠMac":4100,"Ġ90":4101,"mark":4102,"istan":4103,"Ġspread":4104,"ams":4105,"ki":4106,"asy":4107,"rav":4108,"ĠRober":4109,"Ġdemonstr":4110,"rated":4111,"Ġabsolute":4112,"Ġplaces":4113,"Ġimpl":4114,"ibrary":4115,"Ġcards":4116,"Ġdestroy":4117,"Ġvirt":4118,"vere":4119,"Ġappeared":4120,"yan":4121,"point":4122,"Ġbeg":4123,"Ġtemper":4124,"spe":4125,"anted":4126,"ears":4127,"ĠDirect":4128,"Ġlength":4129,"Ġblog":4130,"amb":4131,"Ġinteg":4132,"Ġresources":4133,"acc":4134,"iful":4135,"Ġspot":4136,"Ġforced":4137,"Ġthousands":4138,"ĠMinister":4139,"Ġqual":4140,"ĠFrench":4141,"atically":4142,"Ġgenerally":4143,"Ġdrink":4144,"Ġthus":4145,"IL":4146,"odes":4147,"Ġappropri":4148,"ĠRead":4149,"Ġwhom":4150,"Ġeye":4151,"Ġcollege":4152,"Ġ45":4153,"irection":4154,"Ġensure":4155,"Ġapparent":4156,"iders":4157,"Ġreligious":4158,"Ġminor":4159,"olic":4160,"Ġtro":4161,"ĠWhy":4162,"ribute":4163,"met":4164,"Ġprimary":4165,"Ġdeveloped":4166,"Ġpeace":4167,"Ġskin":4168,"ste":4169,"ava":4170,"Ġblue":4171,"Ġfamilies":4172,"Ġir":4173,"Ġapply":4174,"Ġinform":4175,"ĠSmith":4176,"CT":4177,"ii":4178,"Ġlimit":4179,"Ġresist":4180,"................":4181,"umn":4182,"Ġconflic":4183,"Ġtwe":4184,"udd":4185,"ĠTom":4186,"Ġliter":4187,"que":4188,"bon":4189,"Ġhair":4190,"Ġeventually":4191,"Ġpus":4192,"Ġhelp
ed":4193,"Ġagg":4194,"orney":4195,"ĠApple":4196,"Ġfit":4197,"ĠSur":4198,"Ġprem":4199,"Ġsales":4200,"Ġseconds":4201,"Ġstrength":4202,"Ġfeeling":4203,"¿½":4204,"Ġtour":4205,"Ġknows":4206,"oom":4207,"Ġexerc":4208,"Ġsomew":4209,"�":4210,">>":4211,"Ġspokes":4212,"Ġideas":4213,"Ġregist":4214,"soft":4215,"ĠDel":4216,"ĠPC":4217,"Ġpropos":4218,"Ġlaunch":4219,"Ġbottom":4220,"TH":4221,"ĠPlease":4222,"vest":4223,"itz":4224,"ĠInter":4225,"Ġscript":4226,"Ġrat":4227,"arning":4228,"Ġil":4229,"ĠJer":4230,"ĠAre":4231,"Ġwhatever":4232,"oken":4233,"cience":4234,"Ġmode":4235,"Ġagree":4236,"Ġsources":4237,"Ġinitial":4238,"Ġrestrict":4239,"Ġwonder":4240,"usion":4241,"####":4242,"ĠSil":4243,"ville":4244,"Ġburn":4245,"tw":4246,"asion":4247,"Ġ£":4248,"Ġnor":4249,"uing":4250,"Ġreached":4251,"Ġsun":4252,"Ġcateg":4253,"igration":4254,"Ġcook":4255,"Ġpromot":4256,"Ġmale":4257,"Ġclimate":4258,"Ġfix":4259,"Ġalleged":4260,"UR":4261,"alled":4262,"Ġimages":4263,"Cont":4264,"ota":4265,"Ġschools":4266,"ios":4267,"Ġdrop":4268,"Ġstream":4269,"ĠMo":4270,"Ġpreviously":4271,"aling":4272,"Ġpet":4273,"Ġdouble":4274,"Ġ(@":4275,"annel":4276,"Ġdefault":4277,"ties":4278,"Ġrank":4279,"ĠDec":4280,"ĠCouncil":4281,"Ġweapon":4282,"Ġstock":4283,"Ġanaly":4284,"ĠStr":4285,"Ġpicture":4286,"ĠPolice":4287,"ference":4288,"Ġcentury":4289,"Ġcitizens":4290,"Ġonto":4291,"Ġexpand":4292,"Ġhero":4293,"ĠSol":4294,"Ġwild":4295,"Ġupdate":4296,"Ġcustomers":4297,"ront":4298,"def":4299,"Ġlik":4300,"Ġcriminal":4301,"ĠChristian":4302,"SP":4303,"76":4304,"Ġleaving":4305,"Ġotherwise":4306,"ĠDist":4307,"Ġbasis":4308,"52":4309,"53":4310,"icip":4311,"ĠBer":4312,"Ġrecommend":4313,"Ġfloor":4314,"Ġcrowd":4315,"oles":4316,"Ġ70":4317,"Ġcentral":4318,"ĠEv":4319,"Ġdream":4320,"Ġdownload":4321,"Ġconfir":4322,"ĠThom":4323,"Ġwindow":4324,"Ġhappens":4325,"Ġunit":4326,"Ġtend":4327,"Ġspl":4328,"Ġbecomes":4329,"Ġfighting":4330,"Ġpredict":4331,"ĠPress":4332,"ĠPower":4333,"Ġheavy":4334,"aked":4335,"Ġfan":4336,"orter":4337,"ategy":4338,"BA":4339,"izes":4340,"Ġspend":4341,"Here":4342,"Ġ2007":4343,"Ġadop":4344,"ĠHam":4345,"Ġfootball":4346,"ĠPort":4347,"oday":4348,"51":4349,"ampions":4350,"Ġtransfer":4351,"ht":4352,"Ġ38":4353,"term":4354,"acity":4355,"Ġbur":4356,"],":4357,"ternal":4358,"rig":4359,"but":4360,"Ġtherefore":4361,"ĠBecause":4362,"resp":4363,"rey":4364,"Ġmission":4365,"Some":4366,"Ġnoted":4367,"Ġassum":4368,"Ġdisease":4369,"Ġedit":4370,"Ġprogress":4371,"rd":4372,"ĠBrown":4373,"ocal":4374,"Ġadding":4375,"Ġraised":4376,"ĠAny":4377,"Ġtick":4378,"Ġseeing":4379,"ĠPeople":4380,"Ġagreement":4381,"Ġserver":4382,"Ġwat":4383,"Ġdebate":4384,"Ġsupposed":4385,"iling":4386,"Ġlargest":4387,"Ġsuccessful":4388,"ĠPri":4389,"ĠDemocratic":4390,"Ġjump":4391,"ĠSyria":4392,"Ġowners":4393,"Ġoffers":4394,"Ġshooting":4395,"Ġeffic":4396,"sey":4397,"Ġhaven":4398,"verse":4399,"tered":4400,"ĠLight":4401,"imal":4402,"ĠBig":4403,"Ġdefend":4404,"Ġbeat":4405,"Ġrecords":4406,"%)":4407,"Ġscen":4408,"Ġemployees":4409,"Ġdevices":4410,"hem":4411,"Ġcommer":4412,"ĠMex":4413,"Ġbenefit":4414,"ĠProf":4415,"Ġilleg":4416,"Ġsurface":4417,"ĠAlso":4418,"Ġharm":4419,"ingly":4420,"wide":4421,"ĠAlex":4422,"Ġshut":4423,"ĠCur":4424,"Ġlose":4425,"pm":4426,"Ġchallenge":4427,"semb":4428,"Ġstation":4429,"Ġintelligence":4430,"Ġaccur":4431,"ĠFlor":4432,"Ġrequires":4433,"ĠMal":4434,"bum":4435,"Ġhospital":4436,"Ġspirit":4437,"Ġoffered":4438,"Ġproduce":4439,"ĠCommun":4440,"Ġcreating":4441,"Ġcris":4442,"spect":4443,"Ġended":4444,"Ġdaily":4445,"Ġvoters":4446,"lands":4447,"ias":4448,"ih":4449,"ona":4450,"Ġsmart":4451,"ĠOffice":4452,"ĠLord":445
3,"rial":4454,"ĠInternet":4455,"Ġcircum":4456,"Ġextremely":4457,"'.":4458,"Ġopinion":4459,"ĠMil":4460,"Ġgain":4461,"BS":4462,"ĠFin":4463,"yp":4464,"Ġuseful":4465,"Ġbudget":4466,"Ġcomfort":4467,"isf":4468,"Ġbackground":4469,"eline":4470,"Ġepisode":4471,"Ġenemy":4472,"Ġtrial":4473,"Ġestablish":4474,"date":4475,"ĠCap":4476,"Ġcontinues":4477,"Ġshowing":4478,"ĠUnion":4479,"with":4480,"Ġposted":4481,"ĠSystem":4482,"Ġeat":4483,"rian":4484,"Ġrise":4485,"ĠGermany":4486,"ils":4487,"Ġsigned":4488,"Ġvill":4489,"Ġgrand":4490,"mor":4491,"ĠEngland":4492,"Ġprojects":4493,"umber":4494,"Ġconference":4495,"za":4496,"Ġresponsible":4497,"ĠArab":4498,"Ġlearned":4499,"âĢĶâĢĶ":4500,"ipping":4501,"ĠGeorge":4502,"OC":4503,"Ġreturned":4504,"ĠAustralia":4505,"Ġbrief":4506,"Qu":4507,"Ġbrand":4508,"illing":4509,"abled":4510,"Ġhighest":4511,"Ġtrain":4512,"ĠCommission":4513,"while":4514,"Ġnom":4515,"ception":4516,"Ġmut":4517,"ĠBlue":4518,"Ġincident":4519,"vant":4520,"86":4521,"ĠID":4522,"Ġnuclear":4523,"74":4524,"ĠLike":4525,"ĠRE":4526,"ĠMicro":4527,"li":4528,"mail":4529,"Ġcharges":4530,"89":4531,"Ġadjust":4532,"ado":4533,"Ġearth":4534,"NA":4535,"Ġprices":4536,"PA":4537,"Ġdraft":4538,"Ġruns":4539,"Ġcandidate":4540,"enses":4541,"Ġmanagement":4542,"ĠPhil":4543,"ĠMiss":4544,"Ġteach":4545,"gram":4546,"Ġunderstanding":4547,"ait":4548,"icago":4549,"Add":4550,"ĠEp":4551,"secut":4552,"Ġseparate":4553,"Ġinstance":4554,"Ġeth":4555,"Ġunless":4556,"********":4557,"ĠFore":4558,"inate":4559,"Ġoperations":4560,"Sp":4561,"Ġfaith":4562,"gar":4563,"ĠChurch":4564,"ronic":4565,"Ġconfig":4566,"osure":4567,"Ġactivities":4568,"Ġtraditional":4569,"Ġ36":4570,"Ġdirection":4571,"Ġmachine":4572,"Ġsurround":4573,"Ġpush":4574,"unction":4575,"ĠEU":4576,"Ġeasier":4577,"Ġargument":4578,"GB":4579,"Ġmicro":4580,"Ġspending":4581,"izations":4582,"Ġtheory":4583,"adow":4584,"Ġcalling":4585,"ĠLast":4586,"Ġder":4587,"Ġinfluence":4588,"Ġcommit":4589,"Ġphoto":4590,"Ġunc":4591,"istry":4592,"gn":4593,"aste":4594,"acks":4595,"Ġdisp":4596,"ady":4597,"do":4598,"ĠGood":4599,"Ġ`":4600,"Ġwish":4601,"Ġrevealed":4602,"³³":4603,"lig":4604,"Ġenforce":4605,"ĠCommittee":4606,"Ġchem":4607,"Ġmiles":4608,"Ġinterested":4609,"Ġsolution":4610,"icy":4611,"inct":4612,"Ġ->":4613,"ĠDet":4614,"Ġremoved":4615,"Ġcompar":4616,"eah":4617,"Ġplant":4618,"ĠSince":4619,"Ġachieve":4620,"Ġadvantage":4621,"Ġslightly":4622,"bing":4623,"Ġplaced":4624,"under":4625,"2015":4626,"ĠMad":4627,"Ġtim":4628,"oses":4629,"Ġcru":4630,"ĠRock":4631,"Ġmostly":4632,"Ġnegative":4633,"Ġsetting":4634,"Ġproduced":4635,"Ġmur":4636,"Ġconnection":4637,"ĠMer":4638,"Ġdriver":4639,"Ġexecutive":4640,"Ġassault":4641,"Ġborn":4642,"ĠVer":4643,"tained":4644,"Ġstructure":4645,"Ġreduce":4646,"Ġdecades":4647,"Ġded":4648,"uke":4649,"ĠMany":4650,"idden":4651,"Ġleague":4652,"Se":4653,"Ġjoin":4654,"Ġdisco":4655,"Ġdie":4656,"cks":4657,"actions":4658,"Ġassess":4659,"agn":4660,"Ġgoals":4661,"ours":4662,"IR":4663,"Ġsenior":4664,"iller":4665,"mod":4666,"ipment":4667,"ocol":4668,"uy":4669,"ĠQue":4670,"Ġparties":4671,"irgin":4672,"Ġlearning":4673,"itable":4674,"Ġstreet":4675,"Ġcamera":4676,"App":4677,"Ġskills":4678,"bre":4679,"cious":4680,"Ġcelebr":4681,"ĠFranc":4682,"Ġexisting":4683,"Ġwilling":4684,"lor":4685,"Ġid":4686,"ĠSpace":4687,"Ġcritical":4688,"ĠLa":4689,"ortunately":4690,"Ġserve":4691,"Ġcold":4692,"Ġspecies":4693,"TS":4694,"Ġanimals":4695,"ĠBay":4696,"Ġolder":4697,"ĠUnder":4698,"estic":4699,"ĠTre":4700,"Ġteacher":4701,"Ġprefer":4702,"vis":4703,"Ġthread":4704,"ĠMatt":4705,"Ġmanager":4706,"ãĥ»":4707,"Ġprofessional":4708,"ĠVol":4709,"Ġnote
s":4710,"These":4711,"ula":4712,"Ġfresh":4713,"ented":4714,"uzz":4715,"edy":4716,"clusion":4717,"ĠRel":4718,"Ġdoubt":4719,"EO":4720,"Ġopened":4721,"ĠBit":4722,"Advertisement":4723,"Ġguess":4724,"ĠUN":4725,"Ġsequ":4726,"Ġexplain":4727,"otten":4728,"Ġattract":4729,"aks":4730,"Ġstring":4731,"Ġcontext":4732,"ossible":4733,"ĠRepublicans":4734,"Ġsolid":4735,"Ġcities":4736,"Ġasking":4737,"Ġrandom":4738,"ups":4739,"uries":4740,"arant":4741,"dden":4742,"gl":4743,"ĠFlorida":4744,"Ġdepend":4745,"ĠScott":4746,"Ġ33":4747,"ĠiT":4748,"icon":4749,"Ġmentioned":4750,"Ġ2000":4751,"Ġclaimed":4752,"Ġdefinitely":4753,"ulf":4754,"Ġcore":4755,"Ġopening":4756,"ĠConst":4757,"which":4758,"ĠTra":4759,"AG":4760,"72":4761,"Ġbelieved":4762,"ada":4763,"Ġ48":4764,"ĠSecurity":4765,"yright":4766,"ĠPet":4767,"ĠLou":4768,"Ġholding":4769,"================":4770,"Ġice":4771,"Ġbrow":4772,"Ġauthorities":4773,"host":4774,"word":4775,"Ġscore":4776,"ĠDiv":4777,"Ġcells":4778,"Ġtransl":4779,"Ġneighbor":4780,"Ġremove":4781,"uct":4782,"Ġdistrict":4783,"ĠAccording":4784,"Ġworse":4785,"Ġconcerns":4786,"Ġpresidential":4787,"Ġpolicies":4788,"ĠHall":4789,"73":4790,"Ġhus":4791,"AY":4792,"Ġ2006":4793,"ĠJud":4794,"Ġindependent":4795,"ĠJustice":4796,"iliar":4797,"print":4798,"ighter":4799,"Ġprotection":4800,"zen":4801,"Ġsudden":4802,"house":4803,"ĠJes":4804,"PR":4805,"ĠInf":4806,"Ġbul":4807,"Ġ_":4808,"ĠService":4809,"ĠPR":4810,"Ġstrategy":4811,"ffect":4812,"Ġgirls":4813,"Ġmissing":4814,"oyal":4815,"ĠTeam":4816,"ulated":4817,"Ġdat":4818,"Ġpolitics":4819,"abor":4820,"According":4821,"Ġspell":4822,"Ġgraph":4823,"orthern":4824,"TC":4825,"Ab":4826,"Ġlabor":4827,"isher":4828,"Ġkick":4829,"ĠiTunes":4830,"Ġsteps":4831,"poses":4832,"Ġsmaller":4833,"En":4834,"bert":4835,"Ġroll":4836,"Ġresearchers":4837,"Ġclosed":4838,"Ġtransport":4839,"Ġlawy":4840,"________________":4841,"ĠChicago":4842,"Ġaspect":4843,"Ġnone":4844,"Ġmarriage":4845,"96":4846,"Ġelements":4847,"ĠFre":4848,"ĠSal":4849,"Ġdram":4850,"FC":4851,"top":4852,"equ":4853,"Ġhearing":4854,"Ġsupported":4855,"Ġtesting":4856,"cohol":4857,"Ġmassive":4858,"Ġstick":4859,"Ġguard":4860,"isco":4861,"phone":4862,"From":4863,"However":4864,"Ġborder":4865,"Ġcopy":4866,"ography":4867,"list":4868,"71":4869,"Ġowner":4870,"class":4871,"ruit":4872,"rate":4873,"ĠOnce":4874,"Ġdigital":4875,"Ġtask":4876,"ERS":4877,"Ġincred":4878,"tes":4879,"++":4880,"ĠFrance":4881,"Ġbreat":4882,"owl":4883,"Ġissued":4884,"ĠWestern":4885,"Ġdetect":4886,"Ġpartners":4887,"Ġshared":4888,"ĠCall":4889,"Ġcancer":4890,"ache":4891,"ribe":4892,"Ġexplained":4893,"Ġheat":4894,"{\"":4895,"Ġinvestment":4896,"ĠBook":4897,"Ġwood":4898,"Ġtools":4899,"ĠAlthough":4900,"Ġbelief":4901,"Ġcrisis":4902,"Ġge":4903,"ĠMP":4904,"Ġoperation":4905,"type":4906,"~~":4907,"ga":4908,"Ġcontains":4909,"anta":4910,"Ġexpress":4911,"ĠGroup":4912,"ĠJournal":4913,"ka":4914,"Ġamb":4915,"ĠUSA":4916,"Ġfinding":4917,"Ġfunding":4918,"how":4919,"Ġestablished":4920,"ideos":4921,"Ġdegree":4922,"Ġdangerous":4923,"anging":4924,"Ġfreedom":4925,"pport":4926,"outhern":4927,"Ġchurch":4928,"Ġcatch":4929,"ĠTwo":4930,"Ġpresence":4931,"ĠGuard":4932,"Up":4933,"Ġauthority":4934,"ĠProject":4935,"Ġbutton":4936,"Ġconsequ":4937,"Ġvalid":4938,"Ġweak":4939,"Ġstarts":4940,"Ġreference":4941,"ĠMem":4942,"\")":4943,"UN":4944,"orage":4945,"ĠOpen":4946,"Ġcollection":4947,"ym":4948,"gency":4949,"Ġbeautiful":4950,"ros":4951,"Ġtells":4952,"Ġwaiting":4953,"nel":4954,"Ġproviding":4955,"ĠDemocrats":4956,"Ġdaughter":4957,"Ġmaster":4958,"Ġpurposes":4959,"ĠJapanese":4960,"Ġequal":4961,"Ġturns":4962,"Ġdocuments":4963,"Ġwat
ching":4964,"Res":4965,"Ġran":4966,"2014":4967,"Ġreject":4968,"ĠKorea":4969,"Ġvictims":4970,"Level":4971,"erences":4972,"Ġwitness":4973,"Ġ34":4974,"Ġreform":4975,"coming":4976,"Ġoccup":4977,"Ġcaught":4978,"Ġtraffic":4979,"ading":4980,"Ġmodels":4981,"ario":4982,"Ġserved":4983,"Ġbatter":4984,"uate":4985,"ĠSecretary":4986,"Ġagreed":4987,"Ġtruly":4988,"ynam":4989,"ĠRet":4990,"Ġunits":4991,"ĠResearch":4992,"hand":4993,"azine":4994,"ĠMike":4995,"Ġvariety":4996,"otal":4997,"Ġamazing":4998,"<|endoftext|>":4999} \ No newline at end of file diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index d5af66d62..f31134857 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -3,7 +3,8 @@ from megatron.initialize import initialize_megatron # from megatron.data.data_samplers import MegatronPackedRandomSampler -from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +# from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets #Initialize Megatron with dummy variables initialize_megatron( From 71fb5aea7f12fbe3691aaa7320a75b89c8898a3a Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 23 Jun 2022 21:16:48 +0700 Subject: [PATCH 151/297] adapted test loader --- tests/test_packing_dataloader.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index f31134857..f89cd773c 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -20,7 +20,7 @@ ) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=["tests/data/gpt2/meg-gpt2-openwebtext_text_document"], + data_prefix=["tests/data/t0/ag_news_prompt_text_document"], data_impl="mmap", splits_string="90,5,5", train_valid_test_num_samples=[100,100,100], @@ -29,10 +29,17 @@ skip_warmup=True ) -dl = torch.utils.data.DataLoader( - train_ds, - batch_size=4, - # batch_sampler=batch_sampler, - num_workers=4, - pin_memory=True - ) +print("Test show dataset") +for idx in range(0,4): + line = train_ds[idx] + print(len(line)) + print(line) + + +# dl = torch.utils.data.DataLoader( +# train_ds, +# batch_size=4, +# # batch_sampler=batch_sampler, +# num_workers=4, +# pin_memory=True +# ) From be0cea2d0e29ecc3cdee9d222ac85e36c520eac5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 08:34:29 +0700 Subject: [PATCH 152/297] Update megatron/data/non_causal_mtf_dataset.py Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- megatron/data/non_causal_mtf_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index cae8d4a54..cd46b6b58 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -264,10 +264,8 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. 
From be0cea2d0e29ecc3cdee9d222ac85e36c520eac5 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Fri, 24 Jun 2022 08:34:29 +0700
Subject: [PATCH 152/297] Update megatron/data/non_causal_mtf_dataset.py

Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com>
---
 megatron/data/non_causal_mtf_dataset.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py
index cae8d4a54..cd46b6b58 100644
--- a/megatron/data/non_causal_mtf_dataset.py
+++ b/megatron/data/non_causal_mtf_dataset.py
@@ -264,10 +264,8 @@ def __len__(self):
     def __getitem__(self, idx):
         # Get the shuffled index.
         idx = self.shuffle_idx[idx]
-        doc_idx = self.sample_idx[idx][0]
-
         sample = self.indexed_dataset.get(
-            self.doc_idx[doc_idx]
+            self.doc_idx[idx]
         )
 
         eod_idx = np.where(sample == self.tokenizer.eod)[0]
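Note: the hunk above removes the hop through the packed `sample_idx` table. Unlike the packed GPT dataset, this dataset serves one whole document per sample, so the shuffled sample index maps straight to a document id. A sketch of the resulting index chain, using NumPy stand-ins for the real index arrays (illustrative values, not the dataset's code):

    import numpy as np

    # Stand-ins: doc_idx is a shuffled ordering of document ids, and
    # shuffle_idx a permutation over the samples served in an epoch.
    doc_idx = np.array([2, 0, 1])
    shuffle_idx = np.random.permutation(len(doc_idx))

    def document_for_sample(idx):
        # After the fix: shuffle the sample index, then index documents
        # directly, with no sample_idx table in between.
        return doc_idx[shuffle_idx[idx]]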
From 9daa3766d21546397d7a3cf257f86ac21cf343ba Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Fri, 24 Jun 2022 13:16:37 +0700
Subject: [PATCH 153/297] removed unused files

---
 megatron/data/data_samplers.py               |    1 +
 .../data/t0/ag_news_prompt_text_document.bin | Bin 18494 -> 12526 bytes
 .../data/t0/ag_news_prompt_text_document.idx | Bin 2042 -> 2042 bytes
 tests/data/t0/gpt2-tiny-merges.txt           | 4744 -----------------
 tests/data/t0/gpt2-tiny-vocab.json           |    1 -
 tests/test_packing_dataloader.py             |   23 +
 6 files changed, 24 insertions(+), 4745 deletions(-)
 delete mode 100644 tests/data/t0/gpt2-tiny-merges.txt
 delete mode 100644 tests/data/t0/gpt2-tiny-vocab.json

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index ee436debb..907a82371 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -47,6 +47,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
     elif args.dataloader_type == 'packed':
         batch_sampler = MegatronPackedRandomSampler(
             sequence_length=args.seq_length,
+            dataset=dataset,
             total_samples=len(dataset),
             consumed_samples=consumed_samples,
             micro_batch_size=args.micro_batch_size,

diff --git a/tests/data/t0/ag_news_prompt_text_document.bin b/tests/data/t0/ag_news_prompt_text_document.bin
index 49b142d487d7ad6663aa0abaace9906c18aa7e79..60646247e5037a6b277473adb47a2864b90408dd 100644
GIT binary patch
literal 12526
[base85-encoded binary patch data omitted]
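Note: passing `dataset` into `MegatronPackedRandomSampler` plausibly lets the sampler consult per-sample lengths while assembling batches up to `sequence_length` tokens; the sampler's actual policy is not shown in this hunk. A rough sketch of greedy length-aware packing of that general kind, where the function name and policy are assumptions for illustration:

    def pack_greedy(sample_lengths, sequence_length):
        # Group sample indices so each group's summed length fits the budget.
        batches, current, used = [], [], 0
        for idx, length in enumerate(sample_lengths):
            if used + length > sequence_length and current:
                batches.append(current)
                current, used = [], 0
            current.append(idx)
            used += length
        if current:
            batches.append(current)
        return batches

    # pack_greedy([120, 300, 700, 50], 1024) -> [[0, 1], [2, 3]]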
diff --git a/tests/data/t0/ag_news_prompt_text_document.idx b/tests/data/t0/ag_news_prompt_text_document.idx
index dd5d3206385265e3295a64f29d7d8e446e0a46e0..29e152ec640bb60eadafb52ac6bf6bea43e32e10 100644
GIT binary patch
literal 2042
[base85-encoded binary patch data omitted]

diff --git a/tests/data/t0/gpt2-tiny-merges.txt b/tests/data/t0/gpt2-tiny-merges.txt
deleted file mode 100644
index 64bc5dc65..000000000
--- a/tests/data/t0/gpt2-tiny-merges.txt
+++ /dev/null
@@ -1,4744 +0,0 @@
-#version: 0.2 - Trained by `huggingface/tokenizers`
-Ġ t
-Ġ a
-h e
-i n
-r e
-o n
-Ġt he
-e r
-Ġ s
-a t
-Ġ w
-Ġ o
-e n
-Ġ c
-i t
-i s
-a n
-o r
-e s
-Ġ b
-e d
-Ġ f
-in g
-Ġ p
-o u
-Ġa n
-a l
-a r
-Ġt o
-Ġ m
-Ġo f
-Ġ in
-Ġ d
-Ġ h
-Ġan d
-i c
-a s
-l e
-Ġt h
-i on
-o m
-l l
-en t
-Ġ n
-Ġ l
-s t
-Ġ re
-v e
-Ġ e
-r o
-l y
-Ġb e
-Ġ g
-Ġ T
-c t
-Ġ S
-i d
-o t
-Ġ I
-u t
-e t
-Ġ A
-Ġ is
-Ġ on
-i m
-a m
-o
w -a y -a d -s e -Ġth at -Ġ C -i g -Ġf or -a c -Ġ y -v er -u r -Ġ u -l d -Ġs t -Ġ M -' s -Ġ he -Ġ it -at ion -it h -i r -c e -Ġy ou -i l -Ġ B -Ġw h -o l -Ġ P -Ġw ith -Ġ 1 -t er -c h -Ġa s -Ġw e -Ġ ( -n d -i ll -Ġ D -i f -Ġ 2 -a g -er s -k e -Ġ " -Ġ H -e m -Ġc on -Ġ W -Ġ R -he r -Ġw as -Ġ r -o d -Ġ F -u l -at e -Ġa t -r i -p p -o re -ĠT he -Ġs e -u s -Ġp ro -Ġh a -u m -Ġa re -Ġd e -a in -an d -Ġo r -ig h -es t -is t -a b -r om -Ġ N -t h -Ġc om -Ġ G -u n -o p -0 0 -Ġ L -Ġn ot -es s -Ġe x -Ġ v -re s -Ġ E -e w -it y -an t -Ġb y -e l -o s -or t -o c -q u -Ġf rom -Ġha ve -Ġs u -i ve -ou ld -Ġs h -Ġth is -n t -r a -p e -igh t -ar t -m ent -Ġa l -u st -en d -- - -al l -Ġ O -ac k -Ġc h -Ġ le -i es -re d -ar d -â Ģ -ou t -Ġ J -Ġa b -e ar -i v -al ly -ou r -o st -g h -p t -Ġp l -as t -Ġc an -a k -om e -u d -T he -Ġh is -Ġd o -Ġg o -Ġh as -g e -' t -Ġ U -r ou -Ġs a -Ġ j -Ġb ut -Ġw or -Ġa ll -e ct -Ġ k -am e -Ġw ill -o k -Ġw he -Ġthe y -id e -0 1 -f f -ic h -p l -t her -Ġt r -. . -Ġin t -i e -u re -ag e -Ġn e -i al -a p -in e -ic e -Ġm e -Ġo ut -an s -on e -on g -ion s -Ġwh o -Ġ K -Ġu p -Ġthe ir -Ġa d -Ġ 3 -Ġu s -at ed -ou s -Ġm ore -u e -o g -ĠS t -in d -i ke -Ġs o -im e -p er -. " -b er -i z -a ct -Ġon e -Ġsa id -Ġ - -a re -Ġyou r -c c -ĠT h -Ġc l -e p -a ke -ab le -i p -Ġcon t -Ġwh ich -i a -Ġ im -Ġab out -Ġwe re -ver y -u b -Ġh ad -Ġ en -Ġcom p -, " -ĠI n -Ġu n -Ġa g -i re -ac e -a u -ar y -Ġw ould -as s -r y -Ġ âĢ -c l -o ok -e re -s o -Ġ V -ig n -i b -Ġof f -Ġt e -v en -Ġ Y -i le -o se -it e -or m -Ġ2 01 -Ġre s -Ġm an -Ġp er -Ġo ther -or d -ul t -Ġbe en -Ġl ike -as e -an ce -k s -ay s -ow n -en ce -Ġd is -ct ion -Ġan y -Ġa pp -Ġs p -in t -res s -ation s -a il -Ġ 4 -ic al -Ġthe m -Ġhe r -ou nt -ĠC h -Ġa r -Ġ if -Ġthe re -Ġp e -Ġy ear -a v -Ġm y -Ġs ome -Ġwhe n -ou gh -ac h -Ġth an -r u -on d -ic k -Ġo ver -ve l -Ġ qu -Ċ Ċ -Ġs c -re at -re e -ĠI t -ou nd -p ort -Ġal so -Ġp art -f ter -Ġk n -Ġbe c -Ġt ime -en s -Ġ 5 -op le -Ġwh at -Ġn o -d u -m er -an g -Ġn ew --- -- -Ġg et -or y -it ion -ing s -Ġj ust -Ġint o -Ġ 0 -ent s -o ve -t e -Ġpe ople -Ġp re -Ġit s -Ġre c -Ġt w -i an -ir st -ar k -or s -Ġwor k -ad e -o b -Ġs he -Ġo ur -w n -in k -l ic -Ġ1 9 -ĠH e -is h -nd er -au se -Ġh im -on s -Ġ [ -Ġ ro -f orm -i ld -at es -ver s -Ġon ly -o ll -Ġs pe -c k -e ll -am p -Ġa cc -Ġb l -i ous -ur n -f t -o od -Ġh ow -he d -Ġ ' -Ġa fter -a w -Ġat t -o v -n e -Ġpl ay -er v -ic t -Ġc ould -it t -Ġa m -Ġf irst -Ġ 6 -Ġa ct -Ġ $ -e c -h ing -u al -u ll -Ġcom m -o y -o ld -c es -at er -Ġf e -Ġbe t -w e -if f -Ġtw o -oc k -Ġb ack -) . -id ent -Ġu nder -rou gh -se l -x t -Ġm ay -rou nd -Ġp o -p h -is s -Ġd es -Ġm ost -Ġd id -Ġad d -j ect -Ġin c -f ore -Ġp ol -on t -Ġag ain -cl ud -ter n -Ġkn ow -Ġne ed -Ġcon s -Ġc o -Ġ . 
-Ġw ant -Ġse e -Ġ 7 -n ing -i ew -ĠTh is -c ed -Ġe ven -Ġin d -t y -ĠW e -at h -Ġthe se -Ġp r -Ġu se -Ġbec ause -Ġf l -n g -Ġn ow -ĠâĢ ĵ -c om -is e -Ġm ake -Ġthe n -ow er -Ġe very -ĠU n -Ġse c -os s -u ch -Ġe m -Ġ = -ĠR e -i ed -r it -Ġin v -le ct -Ġsu pp -at ing -Ġl ook -m an -pe ct -Ġ 8 -ro w -Ġb u -Ġwhe re -if ic -Ġyear s -i ly -Ġd iff -Ġsh ould -Ġre m -T h -I n -Ġe v -d ay -' re -ri b -Ġre l -s s -Ġde f -Ġr ight -Ġs y -) , -l es -00 0 -he n -Ġth rough -ĠT r -_ _ -Ġw ay -Ġd on -Ġ , -Ġ1 0 -as ed -Ġas s -ub lic -Ġre g -ĠA nd -i x -Ġ very -Ġin clud -ot her -Ġim p -ot h -Ġsu b -ĠâĢ Ķ -Ġbe ing -ar g -ĠW h -= = -ib le -Ġdo es -an ge -r am -Ġ 9 -er t -p s -it ed -ation al -Ġb r -Ġd own -Ġman y -ak ing -Ġc all -ur ing -it ies -Ġp h -ic s -al s -Ġde c -at ive -en er -Ġbe fore -il ity -Ġwe ll -Ġm uch -ers on -Ġth ose -Ġsu ch -Ġ ke -Ġ end -ĠB ut -as on -t ing -Ġl ong -e f -Ġth ink -y s -Ġbe l -Ġs m -it s -a x -Ġo wn -Ġpro v -Ġs et -if e -ment s -b le -w ard -Ġsh ow -Ġp res -m s -om et -Ġo b -Ġs ay -ĠS h -t s -f ul -Ġe ff -Ġg u -Ġin st -u nd -re n -c ess -Ġ ent -ĠY ou -Ġgo od -Ġst art -in ce -Ġm ade -t t -st em -ol og -u p -Ġ | -um p -Ġhe l -ver n -ul ar -u ally -Ġa c -Ġm on -Ġl ast -Ġ2 00 -1 0 -Ġst ud -u res -ĠA r -sel f -ar s -mer ic -u es -c y -Ġm in -oll ow -Ġc ol -i o -Ġm od -Ġc ount -ĠC om -he s -Ġf in -a ir -i er -âĢ Ķ -re ad -an k -at ch -e ver -Ġst r -Ġpo int -or k -ĠN ew -Ġs ur -o ol -al k -em ent -Ġus ed -ra ct -we en -Ġs ame -ou n -ĠA l -c i -Ġdiff ere -Ġwh ile ----- ---- -Ġg ame -ce pt -Ġs im -.. . -Ġin ter -e k -Ġre port -Ġpro du -Ġst ill -l ed -a h -Ġhe re -Ġwor ld -Ġth ough -Ġn um -ar ch -im es -al e -ĠS e -ĠI f -/ / -ĠL e -Ġre t -Ġre f -Ġtr ans -n er -ut ion -ter s -Ġt ake -ĠC l -Ġcon f -w ay -a ve -Ġgo ing -Ġs l -u g -ĠA meric -Ġspe c -Ġh and -Ġbet ween -ist s -ĠD e -o ot -I t -Ġe ar -Ġagain st -Ġh igh -g an -a z -at her -Ġex p -Ġo p -Ġin s -Ġg r -Ġhel p -Ġre qu -et s -in s -ĠP ro -is m -Ġf ound -l and -at a -us s -am es -Ġp erson -Ġg reat -p r -Ġs ign -ĠA n -' ve -Ġs omet -Ġs er -h ip -Ġr un -Ġ : -Ġt er -ire ct -Ġf ollow -Ġd et -ic es -Ġf ind -1 2 -Ġm em -Ġc r -e red -e x -Ġex t -ut h -en se -c o -Ġte am -v ing -ou se -as h -at t -v ed -Ġsy stem -ĠA s -d er -iv es -m in -Ġle ad -ĠB l -c ent -Ġa round -Ġgo vern -Ġc ur -vel op -an y -Ġc our -al th -ag es -iz e -Ġc ar -od e -Ġl aw -Ġre ad -' m -c on -Ġre al -Ġsupp ort -Ġ1 2 -.. .. 
-Ġre ally -n ess -Ġf act -Ġd ay -Ġb oth -y ing -Ġs erv -ĠF or -Ġth ree -Ġw om -Ġm ed -od y -ĠThe y -5 0 -Ġex per -t on -Ġe ach -ak es -Ġc he -Ġc re -in es -Ġre p -1 9 -g g -ill ion -Ġg rou -ut e -i k -W e -g et -E R -Ġm et -Ġs ays -o x -Ġd uring -er n -iz ed -a red -Ġf am -ic ally -Ġha pp -ĠI s -Ġch ar -m ed -v ent -Ġg ener -i ent -p le -i et -re nt -1 1 -v es -pt ion -Ġ2 0 -form ation -Ġc or -Ġoff ic -ie ld -Ġto o -is ion -Ġin f -Ġ Z -t he -o ad -Ġp ublic -Ġpro g -r ic -* * -Ġw ar -Ġp ower -v iew -Ġf ew -Ġl oc -Ġdiffere nt -Ġst ate -Ġhe ad -' ll -Ġp oss -Ġst at -re t -ant s -Ġv al -Ġis s -Ġc le -i vers -an c -Ġex pl -Ġan other -Ġ Q -Ġa v -th ing -n ce -W h -Ġch ild -Ġs ince -i red -l ess -Ġl ife -Ġde velop -itt le -Ġde p -Ġp ass -ã ĥ -Ġt urn -or n -Th is -b ers -ro ss -ĠA d -Ġf r -Ġres p -Ġsec ond -o h -Ġ / -Ġdis c -Ġ & -Ġsomet hing -Ġcomp le -Ġ ed -Ġf il -Ġmon th -a j -u c -Ġgovern ment -Ġwith out -Ġle g -Ġd ist -Ġp ut -Ġqu est -an n -Ġpro t -2 0 -Ġne ver -i ence -Ġle vel -Ġar t -Ġth ings -Ġm ight -Ġeff ect -Ġcont ro -Ġc ent -Ġ1 8 -Ġall ow -Ġbel ie -ch ool -ot t -Ġinc re -Ġfe el -Ġres ult -Ġl ot -Ġf un -ot e -Ġt y -ere st -Ġcont in -Ġus ing -Ġb ig -2 01 -Ġas k -Ġb est -Ġ ) -I N -Ġo pp -3 0 -Ġnum ber -in ess -S t -le ase -Ġc a -Ġm ust -Ġd irect -Ġg l -Ġ < -Ġop en -Ġp ost -Ġcom e -Ġse em -ord ing -Ġwe ek -ate ly -it al -Ġe l -ri end -Ġf ar -Ġt ra -in al -Ġp ri -ĠU S -Ġpl ace -Ġfor m -Ġto ld -" : -ain s -at ure -ĠTr ump -Ġst and -Ġ # -id er -ĠF r -Ġne xt -Ġs oc -Ġp ur -Ġle t -Ġl ittle -Ġh um -Ġ i -r on -1 5 -Ġ1 5 -Ġcomm un -Ġm ark -ĠThe re -Ġw r -ĠTh at -Ġin formation -w ays -Ġb us -a pp -Ġinv est -m e -Ġh ard -ain ed -e ad -Ġim port -Ġapp ro -Ġt est -Ġt ri -Ġre st -os ed -Ġf ull -Ġc are -ĠS p -Ġc ase -O N -Ġs k -Ġl ess -Ġ + -Ġpart ic -ĠP l -ab ly -u ck -is hed -ch n -b e -Ġl ist -at or -Ġto p -Ġad v -ĠB e -ru ct -Ġd em -r ation -l ing -g y -re en -g er -Ġh ome -Ġle ft -Ġbet ter -Ġd ata -Ġ1 1 -Ġatt ack -Ġpro ble -l ine -ard s -Ġbe h -r al -ĠH ow -ĠS he -ar ge -Ġ -- -: // -Ġb ro -ĠP h -at s -Ġbu ild -w w -id ed -a im -as es -en cy -Ġm ain -in ed -Ġinclud ing -Ġ { -Ġg ot -Ġint erest -Ġke ep -Ġ X -Ġe as -ain ing -Ġcl ass -âĢ ¦ -ĠN o -Ġv ar -Ġsm all -amp le -A T -Ġ ide -ĠS o -Ġre ce -Ġpol it -Ġm ov -Ġpl an -Ġper cent -iv ing -Ġc amp -Ġp ay -1 4 -s c -is ed -Ġu nt -one y -pl oy -== == -Ġdid n -ĠI nd -el s -ert ain -Ġp os -__ __ -i ver -Ġpro cess -Ġprog ram -if ied -ĠR ep -1 6 -u ro -olog y -at ter -in a -Ġn ame -ĠA ll -Ġf our -Ġret urn -v ious -b s -Ġcall ed -Ġm ove -ĠS c -ir d -Ġgrou p -Ġb re -Ġm en -Ġc ap -t en -e e -Ġd ri -le g -he re -uth or -Ġp at -Ġcur rent -id es -Ġp op -t o -ent ion -Ġal ways -Ġm il -Ġwom en -Ġ1 6 -Ġo ld -iv en -ra ph -ĠO r -r or -ent ly -Ġn ear -ĠE x -re am -s h -Ġ1 4 -Ġf ree -iss ion -st and -ĠC on -al ity -us ed -1 3 -Ġdes ign -Ġch ange -Ġch ang -Ġb o -Ġv is -em ber -Ġb ook -read y -Ġk ill -2 5 -pp ed -Ġa way -Ġab le -Ġcount ry -Ġcon st -ar n -Ġor der -A R -i or -i um -or th -1 8 -ail able -Ġs w -Ġm illion -Ġ1 3 -at ic -t ed -ĠG o -Ġo per -en g -Ġth ing -aj or -con om -ĠCom m -Ġwh y -u red -ur al -Ġs chool -b y -ĠM ar -Ġa ff -Ġd ays -Ġan n -us h -an e -I f -e g -Ġpro f -Ġhe alth -ou th -B ut -ion al -. 
, -Ġs ol -Ġal ready -Ġ3 0 -Ġchar act -H e -Ġf riend -E S -i ans -ic le -' d -ĠO n -Ġle ast -Ġp rom -Ġd r -Ġh ist -it her -Ġ est -i qu -1 7 -s on -Ġte ll -Ġt alk -oh n -o int -le ction -A N -Ġunt il -au gh -Ġl ater -Ġ ve -Ġv iew -end ing -iv ed -Ġwor d -w are -Ġc ost -Ġen ough -Ġg ive -ĠUn ited -Ġte chn -are nt -O R -Ġp ar -ĠD r -Ġ201 6 -r ist -er ing -Ġ  -Ġl arge -s ide -ac y -cc ess -Ġw in -Ġimport ant -Ġ19 9 -Ġdoes n -Ġ1 7 -Ġbus iness -Ġcle ar -Ġre se -" , -ur y -Ġe qu -as ter -al f -ĠAmeric an -n ect -Ġex pect -ivers ity -Ġo cc -ĠF l -Ġk ind -Ġme an -Ġp ast -Ġde v -Ġb as -le t -ra ft -Ġor gan -Ġde l -Ġper form -Ġst ory -Ġse ason -ĠC ol -Ġcl aim -Ġc ame -Ġwith in -Ġl ine -Ġpro ject -ĠA t -Ġcontro l -end ed -ĠS y -Ġa ir -iz ation -Ġ * -le y -Ġm oney -id d -Y ou -f or -Ġfam ily -Ġm aking -Ġb it -Ġpol ice -Ġhapp en -Ġ vers -on y -u ff -ĠW hen -Ġs it -ide o -l f -is on -Ġsu re -g in -Ġapp ear -Ġl ight -Ġ es -o f -Ġw ater -Ġt imes -n ot -Ġg row -Ġcomp any -ĠT e -ow s -Ġm ar -our ce -i ol -ar m -b r -Ġex ample -Ġcon c -Ġf ore -ĠT o -p ro -E N -ri es -Ġ2 5 -ĠC an -ne y -Ġact ually -Ġe ver -ur ity -ak en -ap s -Ġt ax -Ġm ajor -am a -Ġof ten -er al -Ġhum an -Ġj ob -is ter -Ġav ailable -oc r -en n -a id -iv id -Ġrec ord -? " -Ġs ing -ĠA m -id ence -Ġnew s -st er -Ġe conom -Ġfollow ing -ĠB r -is ing -Ġh our -m ost -um ent -Ġse x -Ġdes c -Ġbec ome -ĠE d -Ġto ok -Ġha ving -Ġprodu ct -a ult -A s -ar ing -Ġme ans -Ġh op -un e -Ġch o -Ġc ertain -Ġn on -Ġde al -2 4 -le ment -oc i -en e -Ġs ide -ĠP r -ĠM ay -Ġre ason -u ed -c hed -ul ation -Ġe lect -Ġoffic ial -Ġposs ible -Ġh old -and s -ot s -Ġc ity -or ies -Ġse ver -Ġchild ren -Ġon ce -Ġact iv -l er -Ġn ight -it ions -ĠJ ohn -a pe -pl ay -Ġd one -Ġl im -Ġwork ing -ĠP res -or ld -e b -ĠC o -Ġb ody -ail s -ut es -ĠM r -Ġwhe ther -Ġa uthor -ro p -Ġpro per -Ġse en -) ; -Ġf ac -ĠS u -Ġcon d -it ing -Ġcour se -Ġ } --------- -------- -a ign -Ġev ent -Ġen g -Ġp ot -Ġin tern -i am -Ġsh ort -em pt -ã Ĥ -ĠG od -il ar -8 0 -Ġor ig -I S -our n -ab ility -it ive -Ġd am -Ġ1 00 -Ġp ress -Ġdo ing -Ġprot ect -r ing -Ġthough t -Ġquest ion -re w -ĠW ar -Ġsever al -ĠSt ate -Ġg iven -Ġf und -ĠT w -Ġw ent -an ces -w ork -p or -m y -4 0 -Ġar g -art ment -ust om -Ġpol ic -Ġme et -Ġc reat -2 2 -ĠSt ates -Ġg ames -ra w -ut ure -Ġunder stand -ur s -ĠO b -l ish -s y -Ġm akes -Ġw on -ag on -Ġh tt -Ġl ove -ent ial -Ġcomple te -p ar -ĠI m -A L -Ġacc ount - ł -ore d -ver t -Ġ ident -Ġ201 5 -Ġother s -ĠM in -i ber -ver age -The re -ition al -d d -Ġpro b -Ġyou ng -Ġal ong -Ġacc ording -Ġy et -Ġmem bers -ĠWh at -o id -ĠM an -A nd -Ġam ong -a i -Ġem ploy -ĠR es -Ġ > -Ġinv ol -Ġl ow -a f -ĠC ar -Ġh ig -ĠO ne -ĠS ec -in ation -Ġlike ly -Ġan t -ag ed -ĠR uss -Ġb en -Ġre le -F or -b ack -ĠN ot -Ġpres ident -b all -Ġacc ess -ivid ual -ĠD em -ĠE uro -6 0 -Ġkn own -ir l -ĠG r -Ġear ly -u se -iet y -âĢ ĵ -Ġf ight -Ġs ent -Ġto day -Ġmark et -" . 
-Ġb ased -Ġstr ong -ur ther -Ġde b -m ber -Ġproble m -Ġde ath -Ġsoc ial -im ate -A S -ort un -Ġcamp aign -er y -C h -Ġe y -i ally -Ġm us -w h -p os -Ġ er -Ġsa f -Ġmonth s -ir on -Ġv iol -Ġf ive -Ġst re -Ġplay ers -in c -al d -y ear -a un -Ġsu ccess -Ġpres ent -ere nce -Ġ201 4 -Ġsu gg -Ġpartic ular -Ġtr y -Ġsugg est -ĠCh rist -on es -Ġpri v -2 3 -Ġc rit -Ġl and -Ġloc al -if y -2 9 -Ġa ut -E D -ĠG u -Ġm ult -Ġpolit ical -Ġask ed -Ġfor mer -it ter -ri pt -Ġcl ose -Ġp ract -ĠY ork -Ġget ting -Ġac ross -Ġcom b -Ġbelie ve -Ġ z -Ġto get -Ġtoget her -ĠC ent -ir c -Ġind ividual -ĠM c -2 7 -is k -ĠE ng -Ġf ace -Ġ2 4 -Ġval ue -Ġare a -e v -Ġw rit -ĠPres ident -Ġv ot -Ġke y -Ġm om -p ut -Ġany thing -Ġexper ience -att le -Ġm ind -a ff -om m -Ġf uture -g ed -Ġc ut -Ġto t -it ch -Ġv ideo -Ġinvest ig -Ġn et -ĠM y -r ict -i en -. ) -Ġimp ro -th ough -ward s -Ġcon nect -ĠM ed -sel ves -ens ive -m b -o ber -at ors -A n -Ġ5 0 -Ġre du -res ent -Ġab ove -Ġf re -ĠEuro pe -s w -Ġam ount -ĠA pp -Ġe ither -Ġmil it -Ġan al -Ġf ail -ĠE n -al es -Ġspec ial -Ġbl ack -I T -c her -Ġlook ing -Ġf ire -y n -Ġal most -o on -Ġstud y -Ġm iss -c hes -ro wn -Ġt re -Ġcommun ity -Ġmed ia -Ġf ood -Ġcom es -ĠUn iversity -Ġsing le -Wh at -u ly -Ġh alf -ag ue -h od -ĠRep ublic -Ġstart ed -Ġqu ick -ot o -b ook -Ġiss ue -it or -Ġel se -Ġcons ider -2 6 -ro du -Ġt aken -2 8 -9 9 -ĠW ith -Ġtr ue -Ġw a -Ġtr ad -Ġag o -Ġm ess -ie f -Ġadd ed -o ke -Ġb ad -Ġf av -3 3 -Ġsim ilar -as k -ĠD on -Ġcharact er -ort s -ĠH ouse -Ġreport ed -Ġty pe -v al -i od -ĠHow ever -Ġt arg -Ġent ire -pp ing -Ġhist ory -Ġl ive -ff ic -.... .... -ed eral -Ġtr ying -Ġdisc uss -ĠH ar -ac es -l ished -Ġse lf -os p -re st -Ġro om -el t -Ġf all -ol ution -Ġe t -Ġ x -Ġis n -Ġide a -b o -Ġs ound -ĠD ep -Ġsome one -ci ally -ull y -Ġf oc -Ġob ject -if t -ap er -Ġplay er -Ġr ather -Ġserv ice -as hing -ĠD o -ĠP art -ru g -m on -p ly -Ġm or -Ġnot hing -Ġprov ide -I C -un g -Ġpart y -Ġex ist -Ġm ag -7 0 -Ġr ul -Ġh ouse -Ġbeh ind -Ġhow ever -ĠW orld -Ġs um -Ġapp lic -Ġ ; -Ġfun ction -g r -ĠP ol -Ġfr ont -2 00 -Ġser ies -Ġt em -Ġty p -ill s -Ġo pt -Ġpoint s -Ġbel ow -itt ed -Ġspec ific -Ġ201 7 -um b -Ġr a -Ġpre vious -Ġpre t -re me -Ġc ustom -Ġcour t -ĠM e -Ġre pl -Ġwho le -g o -c er -Ġt reat -ĠA ct -Ġprob ably -Ġle arn -end er -ĠA ss -Ġvers ion -n ow -Ġche ck -ĠC al -R E -min ist -O n -our ces -Ġben ef -Ġd oc -Ġdet er -Ġen c -Ġsu per -Ġadd ress -Ġv ict -Ġ201 3 -Ġme as -t r -Ġf ield -W hen -Ġsign ific -u ge -Ġfe at -Ġcomm on -l oad -Ġbe gin -Ġbr ing -Ġa ction -er man -Ġdesc rib -Ġind ust -Ġwant ed -ri ed -m ing -Ġatt empt -4 5 -f er -Ġd ue -ress ion -# # -Ġsh all -Ġs ix -o o -Ġst ep -Ġp ub -Ġhim self -Ġ2 3 -Ġc op -Ġd est -Ġst op -A C -ib ility -Ġl ab -ic ult -Ġhour s -Ġcre ate -Ġf urther -ĠAmeric a -ĠC ity -Ġd ou -he ad -S T -ĠN orth -c ing -Ġn ational -u le -ĠIn st -Ġt aking -ĠQ u -ir t -Ġre d -Ġrese arch -v iron -ĠG e -Ġbre ak -an a -Ġsp ace -ater ial -Ġrec ent -ĠA b -Ġgener al -Ġh it -Ġper iod -Ġevery thing -ive ly -Ġph ys -Ġsay ing -an ks -Ġc ou -Ġc ult -ac ed -e al -u ation -Ġc oun -l u -Ġinclud e -Ġpos ition -ĠA fter -ĠCan ad -ĠE m -Ġim m -ĠR ed -Ġp ick -Ġcom pl -Ġm atter -re g -e xt -ang u -is c -o le -a ut -Ġcomp et -e ed -f ect -Ġ2 1 -ĠS en -ĠThe se -as ing -Ġcan not -Ġin it -Ġrel ations -ac hed -Ġb ar -Ġ4 0 -ĠT H -Ġ201 2 -Ġv ol -Ġg round -Ġsec urity -Ġup d -il t -3 5 -Ġconc ern -ĠJ ust -Ġwh ite -Ġseem s -ĠH er -pe cially -i ents -Ġann oun -Ġf ig -ight s -Ġst ri -l ike -id s -Ġs us -Ġw atch -Ġ â -Ġw ind -ĠC ont -Ġit self -Ġm ass -A l -y le -iqu e -ĠN ational -Ġab s -Ġp 
ack -Ġout side -Ġan im -Ġp ain -et er -Ġman ag -du ct -og n -Ġ ] -ĠSe pt -se c -o ff -ĠJ an -Ġf oot -ad es -Ġth ird -Ġm ot -Ġev idence -int on -Ġth reat -a pt -pl es -c le -Ġl o -Ġde cl -Ġit em -med i -Ġrep resent -om b -am er -Ġsignific ant -og raph -s u -Ġc al -i res -00 00 -I D -A M -Ġsim ply -Ġlong er -Ġf ile -O T -c he -S o -ate g -or g -ĠH is -Ġen er -Ġd om -Ġup on -il i -": " -Ġthem selves -Ġcom ing -Ġqu ite -Ġdiff icult -ĠB ar -il ities -re l -end s -c ial -6 4 -Ġwom an -ra p -y r -Ġne cess -ip s -Ġte xt -Ġrequ ire -Ġmilit ary -Ġre view -Ġresp ons -7 5 -Ġsub ject -Ġinst ead -Ġiss ues -Ġg en -" ," -Ġmin utes -Ġwe ap -r ay -am ed -t ime -b l -H ow -Ġc ode -ĠS m -Ġhig her -ĠSt e -r is -Ġp age -Ġstud ents -ĠIn tern -Ġmet hod -ĠA ug -ĠP er -ĠA g -Ġpolic y -ĠS w -Ġex ec -Ġac cept -um e -rib ut -Ġword s -Ġfin al -Ġchang es -ĠDem ocr -Ġfriend s -Ġres pect -Ġe p -Ġcomp an -iv il -Ġdam age -** ** -og le -viron ment -Ġne g -ent al -Ġa p -Ġtot al -iv al -! " -l im -Ġneed s -Ġag re -Ġdevelop ment -Ġa ge -ip le -2 1 -Ġresult s -ĠA f -S h -Ġg un -ĠOb ama -ro ll -Ġ @ -Ġright s -ĠB rit -Ġrun ning -Ġwas n -Ġp ort -Ġr ate -Ġpret ty -Ġtarg et -Ġsa w -Ġc irc -Ġwor ks -ic ro -al t -o ver -ww w -Th at -l ier -Ġevery one -ud e -Ġp ie -idd le -ra el -Ġr ad -Ġbl ock -Ġw alk -T o -ã ģ -n es -ĠA ust -a ul -ro te -ĠS outh -ess ion -op h -Ġshow s -Ġs ite -Ġj o -Ġr isk -cl us -l t -Ġin j -id ing -ĠS pe -Ġch all -ir m -Ġ2 2 -itt ing -st r -Ġh y -L E -ke y -Ġbe gan -at ur -ashing ton -l am -ĠD av -b it -Ġs ize -ĠP ar -3 8 -ourn al -f ace -Ġdec ision -Ġl arg -Ġj ud -re ct -Ġcontin ue -ĠO ct -ove red -ĠI nt -==== ==== -Ġp arent -ĠW ill -Ġeas y -Ġd rug -ang er -Ġs ense -Ġd i -id ay -Ġener gy -ist ic -Ġass oci -ar ter -ob al -e ks -ĠE l -ur ch -Ġg irl -o e -it le -Ġ2 8 -ĠC he -Ġrequ est -Ġso on -Ġh ost -k y -Ġst ates -om es -Ġm aterial -le x -Ġmom ent -Ġan sw -on se -Ġes pecially -Ġn orm -Ġserv ices -p ite -r an -Ġro le -4 4 -) : -Ġc red -C l -____ ____ -Ġm at -Ġl og -ĠCl inton -O U -Ġoff ice -Ġ2 6 -Ġch arg -Ġtr ack -m a -Ġhe art -Ġb all -Ġperson al -Ġbuild ing -n a -s et -b ody -ĠBl ack -Ġincre ase -itt en -Ġneed ed -3 6 -3 2 -= " -Ġl ost -Ġbec ame -Ġgrou ps -ĠM us -Ġw rote -ĠP e -Ġpro p -j oy -à © -ĠWh ite -Ġde ad -. ' -Ġhtt p -Ġwe bs -O S -Ġins ide -Ġwr ong -Ġstat ement -Ġ ... 
-y l -Ġfil m -Ġmus ic -Ġsh are -ific ation -Ġre lease -Ġfor ward -Ġst ay -Ġcomp ut -it te -s er -Ġorig inal -Ġc ard -Ġc and -Ġd iv -at ural -Ġfav or -O M -Ġc ases -us es -Ġse ction -Ġle ave -g ing -ov ed -ĠW ashington -3 9 -ĠG l -Ġrequ ired -act ion -ap an -o or -it er -ĠK ing -Ġcount ries -ĠG erman -ll ing -Ġ2 7 -3 4 -Ġquest ions -Ġpr im -Ġc ell -Ġsh oot -Ġany one -ĠW est -Ġaff ect -ep end -Ġon line -ĠIs rael -ĠSept ember -Ġab ility -Ġcont ent -is es -Ġre ve -Ġl aun -Ġind ic -Ġfor ce -c ast -Ġso ld -av ing -f l -Ġso ft -Ġcompan ies -ce ed -Ġart icle -Ġa ud -Ġre v -Ġed uc -Ġplay ing -0 5 -Ġhe ld -ct or -Ġrele ased -Ġf ederal -3 7 -Ġad minist -Ġinter view -Ġinst all -Ġrece ived -Ġs ource -u k -P h -Ġser ious -Ġcre ated -Ġc ause -Ġim medi -Ġdef in -u el -ĠDep artment -ct ions -ĠC our -ĠN ow -z e -it es -it ution -Ġl ate -Ġspe ak -n ers -Ġleg al -ar i -ĠC or -Ġwe eks -Ġmod el -Ġp red -Ġex act -B C -ĠB y -IN G -os ing -Ġt akes -Ġreg ard -Ġopp ortun -Ġpr ice -Ġ19 8 -ĠA pr -f ully -Ġor d -Ġproble ms -ru ction -h am -ĠC ount -le ge -Ġlead ers -E T -le v -Ġde ep -olog ical -es e -h aps -ĠS ome -Ġp ers -Ġcont ract -Ġrelations hip -s p -ou d -Ġb ase -4 8 -m it -A d -anc ial -Ġcons um -Ġpot ential -Ġl angu -re m -et h -Ġrel ig -ress ed -6 6 -Ġl ink -Ġl ower -ay er -ĠJ une -Ġf em -un t -er c -ur d -Ġcont act -Ġ ill -Ġm other -Ġest ab -h tt -ĠM arch -ĠB ro -ĠCh ina -Ġ2 9 -Ġs qu -Ġprov ided -Ġa verage -as ons -Ġ201 1 -Ġex am -l in -5 5 -n ed -Ġper fect -Ġt ou -al se -u x -Ġbu y -Ġsh ot -Ġcol lect -Ġph ot -Ġplay ed -Ġsur pr -Ġofficial s -Ġsim ple -av y -Ġindust ry -Ġhand s -g round -Ġp ull -Ġr ound -Ġus er -Ġr ange -u ary -Ġpriv ate -op s -e es -Ġw ays -ĠM ich -Ġve h -Ġex cept -Ġter ms -im um -pp er -I ON -ore s -ĠDr agon -ou l -Ġd en -Ġperform ance -Ġb ill -c il -4 7 -Ġen vironment -Ġex c -ad d -Ġwor th -Ġp ict -Ġch ance -Ġ201 8 -b or -Ġspe ed -ict ion -Ġal leg -ĠJ apan -at ory -re et -Ġm atch -ĠI I -Ġst ru -ord er -Ġst e -Ġl iving -Ġst ruct -in o -Ġse par -her n -Ġresp onse -Ġen joy -Ġv ia -A D -um ents -ace book -Ġmem ber -ib r -iz ing -Ġto ol -ĠM on -ĠWh ile -h ood -ĠA ng -ĠD ef -Ġoff er -T r -a ur -Ġturn ed -ĠJ uly -d own -an ced -Ġrec ently -ĠE ar -Ġc e -ĠSt ar -ĠC ong -rough t -Ġbl ood -Ġhop e -Ġcom ment -ain t -Ġar ri -il es -Ġpartic ip -ough t -ri ption -0 8 -4 9 -Ġg ave -Ġse lect -Ġkill ed -sy ch -Ġgo es -i j -Ġc oll -Ġimp act -at ives -ĠS er -0 9 -ĠAug ust -Ġb oy -d e -ĠD es -Ġf elt -U S -Ġexpect ed -Ġim age -ĠM ark -cc ording -o ice -E C -ĠM ag -en ed -h old -ĠP ost -Ġpre vent -N o -Ġinvol ved -Ġey es -Ġquick ly -A t -un k -Ġbeh av -Ġ ur -Ġl ed -c ome -e y -Ġcand id -Ġear lier -Ġfoc us -et y -P ro -led ge -ix ed -ill ed -Ġpop ular -A P -Ġset t -l ight -Ġvar ious -in ks -Ġlevel s -Ġro ad -ell ig -ab les -he l -itte e -ĠG ener -y pe -Ġhe ard -ic les -Ġm is -Ġus ers -ĠS an -Ġimpro ve -Ġf ather -Ġse arch -The y -v il -Ġprof ess -Ġkn ew -Ġl oss -Ġev ents -6 5 -Ġb illion -0 7 -0 2 -ĠNew s -ĠA M -Ġco ver -w here -ens ion -Ġb ott -Ġare as -en ces -op e -ĠTw itter -a el -Ġget s -ĠGo ogle -Ġs n -i ant -Ġv ote -Ġnear ly -Ġinclud ed -Ġrec ogn -z z -m m -al ed -Ġhappen ed -0 4 -Ġh ot -Ġwho se -Ġc ivil -Ġsu ff -o es -it iz -ĠSy ri -Ġresp ond -Ġh on -Ġfeat ures -Ġeconom ic -ĠApr il -r im -Ġtechn ology -Ġo ption -ag ing -Ġpur ch -R e -Ġl at -ch ie -is l -Ġrec omm -u f -Ġtr aining -Ġeffect s -Ġf ast -Ġ201 0 -Ġocc ur -Ġwebs ite -Ġem ail -Ġs ens -e ch -Ġo il -Ġinf lu -Ġcurrent ly -ĠS ch -ĠAd d -Ġgo al -Ġsc ient -Ġcon v -1 00 -em y -Ġdec ided -Ġtra vel -Ġm ention -L L -0 3 -Ġe lection -Ġph one -Ġlook s -Ġsit 
uation -Ġc y -Ġh or -b ed -ĠCour t -a ily -av es -Ġqu ality -ĠCom p -w ise -Ġt able -Ġst aff -ĠW ind -et t -Ġtri ed -ide red -Ġadd ition -Ġb ox -Ġl ack -ar ily -Ġw ide -Ġm id -Ġbo ard -ys is -Ġant i -h a -Ġd ig -en ing -Ġd ro -C on -6 8 -Ġsl ow -b ased -se qu -Ġp ath -E x -ak er -Ġwork ed -Ġp en -Ġeng ine -Ġlook ed -ĠSu per -ĠS erv -Ġvict im -U n -Ġproper ty -Ġint rodu -Ġexec ut -ĠP M -L e -Ġcol or -ĠM ore -Ġ6 0 -Ġnet work -Ġd ate -c ul -id ge -Ġext ra -3 1 -Ġs le -6 7 -Ġw ond -Ġreport s -j ust -ĠAust ral -Ġcap ital -Ġen s -Ġcomm and -Ġallow ed -Ġpre p -Ġca pt -h ib -Ġnum bers -ch an -Ġf air -m p -om s -Ġre ach -W ith -t ain -Ġbro ad -Ġcou ple -ec ause -ly ing -ĠF eb -Ġsc reen -Ġl ives -Ġpri or -ĠCong ress -A r -Ġappro ach -Ġe mer -ar ies -ĠD is -s erv -ĠN e -Ġbu ilt -c ies -Ġre pe -Ġrul es -for ce -ĠP al -Ġfin ancial -Ġcons idered -ĠCh ar -n ces -ĠI S -Ġb rought -Ġb i -i ers -ĠS im -O P -Ġproduct s -Ġvis it -Ġdoc ument -Ġcon duct -Ġcomplete ly -in ing -ĠCal if -ib ly -Ġwr itten -ĠT V -em ents -Ġd raw -O ne -Ġpub lished -Ġsec ret -r ain -he t -ĠF acebook -ond ay -ĠU p -Ġsex ual -Ġth ous -ĠP at -Ġ ess -Ġstand ard -Ġar m -g es -ect ion -Ġf ell -Ġfore ign -an i -ĠFr iday -Ġreg ular -in ary -Ġincre ased -Ġus ually -Ġdem on -Ġd ark -Ġadd itional -ro l -ĠO f -Ġprodu ction -! ! -und red -Ġintern ational -id ents -ĠF ree -rou p -Ġr ace -Ġm ach -Ġh uge -A ll -le ar -ove mber -Ġto wn -Ġatt ention -ĠO ff -y ond -ĠThe n -f ield -Ġter ror -ra z -ĠB o -Ġmeet ing -ĠP ark -Ġar rest -Ġf ear -Ġa w -ĠV al -or ing -' , -Ġext reme -ar r -Ġwork ers -A fter -Ġ3 1 -n et -am ent -Ġdirect ly -Ġpop ulation -ub e -ĠOct ober -ĠI N -ĠJan uary -5 9 -ĠDav id -Ġc ross -ce mber -ĠF irst -Ġmess age -ir it -Ġn ation -Ġp oll -is ions -Ġansw er -n y -is ode -Ġcar ry -ĠRuss ia -Ġhe ar -eng th -ro y -Ġn atural -in ally -Ġdo g -m itted -Ġtr ade -Ġsub st -Ġmult iple -ĠAf ric -Ġf ans -Ġs ort -Ġgl obal -ic ation -ĠW ed -ar a -Ġa chie -Ġlangu age -ve y -Ġt al -Ġnecess ary -Ġdet ails -Ġs en -ĠS und -ĠRe g -ĠR ec -0 6 -Ġs il -ress ive -Ġmed ical -un ch -orn ia -Ġu nd -f ort -oc ks -ĠM onday -ues day -c raft -7 7 -ur t -Ġ ver -ĠH ill -Ġrece ive -Ġmor ning -es tern -Ġb ank -Ġs at -ir th -ĠH igh -Ġdev ice -ĠTH E -ĠCent er -Ġsaf e -Ġp le -ĠCanad a -Ġsystem s -Ġass ist -Ġsur v -Ġb attle -ĠS oc -vert is -S he -Ġp aper -Ġgrow th -Ġc ast -S c -Ġpl ans -ll ed -Ġpart s -Ġw all -Ġmove ment -Ġpract ice -im ately -Ġdis play -Ġsomet imes -om p -ĠP aul -ĠY es -k ing -5 8 -o ly -Ġs on -Ġav oid -ok es -ĠJ ew -Ġto wards -as c -Ġ // -ĠK ore -Ġtalk ing -Ġcor rect -Ġsp ent -ic ks -i able -e ared -Ġter m -Ġwant s -om ing -Ġ ut -Ġdou b -Ġfor ces -Ġp lease -6 9 -ĠN ovember -at form -ond on -Ġon es -Ġimmedi ately -ĠRuss ian -ĠM et -Ġde g -Ġparent s -C H -ĠAmeric ans -al y -ĠM od -Ġsh own -Ġcond itions -Ġst uff -Ġre b -ĠY our -Ġinclud es -n own -ĠS am -Ġexper ien -m ission -ĠE ven -augh t -Ġannoun ced -ĠRepublic an -Ġdeter min -Ġdescrib ed -ĠCount y -( ) -Ġdo or -Ġchang ed -Ġne igh -ĠH ere -Ġcle an -Ġp an -ĠDe cember -ĠEurope an -ir ing -ap ter -Ġcl ub -ĠT uesday -Ġp aid -ĠN et -Ġattack s -Ġcharact ers -Ġal one -Ġdirect or -d om -Ġ3 5 -Ġl oad -Ġr out -ĠCalif ornia -Ġfin ally -Ġr ac -Ġcont r -Ġexact ly -res h -p ri -ĠIs lam -Ġn ature -Ġcare er -Ġlat est -Ġcon vers -ĠS l -p ose -ci ent -ĠIn c -iv ity -8 8 -ĠA tt -ĠM or -nes day -Ġwe ight -k en -Ġnot e -Ġteam s -Ġ \ -air s -ĠG reen -Ġh undred -on ent -Ġstre ng -Ġcons ist -ic ated -Ġreg ul -Ġl ic -ast ic -Ġt en -urs day -ellig ence -ous ly -ĠU K -B I -Ġcost s -Ġind epend -ĠA P -Ġnorm al -Ġh om -Ġob vious -Ġs we 
-Ġst ar -Ġread y -ac her -Ġimp lement -g est -Ġs ong -ĠG et -ĠL ab -Ġinterest ing -us ing -Ġg iving -ĠSund ay -Ġet c -Ġm iddle -Ġrem ember -r ight -os ition -ut ions -Ġm ax -4 6 -Ġyour self -Ġdem and -Ġtreat ment -Ġd anger -ĠC ons -Ġgu y -ĠBrit ish -Ġphys ical -Ġrel ated -Ġrem ain -Ġcould n -Ġref er -Ġc itiz -b ox -EN T -bo ard -Ġin n -I G -er o -ĠSt reet -osp ital -ren ch -cher s -Ġst ra -O L -ag er -ĠA N -Ġeas ily -I A -en ge -in y -Ġcl os -ock ed -Ġus es -ĠC oun -I m -u ild -? ? -m ore -Ġan g -Ġwr ite -ol ute -5 7 -Ġlead er -Ġread ing -< / -Ġaut om -est s -4 3 -Ġleg isl -ĠG old -Ġdesign ed -ĠS T -ĠLe g -a res -Ġbe aut -ĠT ex -Ġappear s -Ġstru gg -ĠR om -Ġ 00 -Ġcho ice -Ġparticular ly -ĠF rom -op er -ĠL ondon -ann ed -Ġallow s -ob ile -Ġdiffere nce -âĢ ¢ -ĠV iew -ĠWed nesday -Ġal though -Ġrel ative -Ġapplic ation -ate ver -Ġare n -Ġmy self -Ġim ag -Ġdis e -Ġsoc iety -Ġfre qu -ĠEng lish -Ġpo or -ĠD ay -Ġwrit ing -Ġse ven -Ġstart ing -Ġb ud -Ġpr int -ĠTr ans -uf act -ĠSt ud -n ew -Ġcr im -Ġg ives -Ġco ol -a e -i ance -ĠGener al -Ġthink ing -Ġsa ve -Ġlim ited -ĠPart y -Ġmean ing -p en -ow ers -ĠJ ack -E M -Ġn ice -ru pt -Ġg as -Ġe ight -Ġfe et -Ġeff ort -Ġ ign -ic it -B l -co in -Ġop in -Ġbr ain -Wh ile -he st -ĠTh ursday -Ġwould n -augh ter -Ġtou ch -le ments -Ġstud ies -Ġcent er -c ont -or ge -Ġcomput er -Ġinvestig ation -P l -or ks -Ġ200 8 -Ġincre asing -Ġst ore -Ġcom ments -Ġb al -m en -Ġdo ll -Ġl iber -Ġw ife -Ġlaw s -atur day -it ness -Ġmod ern -ĠS k -Ġadminist ration -Ġopportun ity -Ġs al -Ġpower ful -M y -Ġclaim s -ĠEar th -ord s -Ġt itle -Ġes c -n ame -N ot -om en -Ġbe yond -Ġc amer -Ġse ll -it ute -ear ch -Ġapp l -im ent -4 2 -ĠAr t -Ġun f -Ġviol ence -ur g -ĠE ast -Ġcomp ared -Ġopt ions -Ġthrough out -Ġv s -ig r -. [ -ac hes -7 8 -Ġfil es -F L -E L -ar ian -ĠJ ames -ĠA ir -an ch -Ġdet ail -Ġpie ce -P S -Ġn amed -Ġeduc ation -Ġdri ve -Ġitem s -Ġstud ent -ic ed -: : -ic o -Ġth row -Ġsc ene -Ġcomple x -Ġ200 9 -Ġpre c -ĠB re -7 9 -Ġcon cept -Ġstat us -am ing -Ġd ied -Ġknow ledge -Ġbegin ning -O D -ru ary -Ġcertain ly -Ġgu ys -Ġsl ight -in n -ound s -Ġf ine -Ġf at -ic ations -Ġper haps -ĠA nt -Ġinc ome -Ġhtt ps -Ġmajor ity -port s -st on -Ġgreat er -Ġfe ed -ent ially -Ġsaf ety -Ġun ique -and om -Ġg one -Ġshow ed -Ġhist or -Ġcoun ter -i us -id a -Ġlead ing -i pe -Ġs end -ĠDon ald -er ve -Ġdef ense -ines e -Ġy es -ĠF ire -ĠMus lim -ra q -Ġcontin ued -os h -Ġprov ides -Ġpr ison -ĠP re -Ġhapp y -Ġeconom y -Ġtr ust -ag s -ĠG ame -Ġweap ons -um an -ĠC le -it ation -Ġanal ysis -ĠT imes -Ġsc ience -- > -Ġfig ure -Ġdis app -ent y -Ġsoft ware -Ġu lt -Ġoffic ers -N ew -I s -Ġrem ains -ĠInd ia -Ġp sych -ri ef -Ġc at -es c -Ġob serv -Ġst age -ĠD ark -Ġent er -ch ange -Ġpass ed -Ġdes pite -ĠO ut -Ġmov ie -r s -Ġv oice -m ine -ĠPl ay -Ġto ward -ĠT er -Ġreg ion -Ġval ues -or ters -Ġm ount -Ġoffic er -ĠO ther -b an -Ġh ous -w ood -ro om -I V -ĠS un -se e -ĠO ver -ro g -9 0 -Ġl ay -ĠT ur -a wn -Ġpress ure -ĠS ub -Ġbook s -ed om -ĠS and -A A -ag o -Ġre asons -f ord -Ġactiv ity -U T -N ow -ĠSen ate -ce ll -n ight -Ġcall s -in ter -Ġlet ter -ĠR ob -ĠJ e -Ġcho ose -ĠL aw -G et -B e -Ġro b -Ġtyp es -Ġpl atform -Ġqu arter -R A -ĠT ime -Ġmay be -ĠC r -9 5 -p re -Ġmov ing -Ġl if -Ġgo ld -Ġs om -Ġpat ients -Ġtr uth -ĠK e -ur ance -ant ly -m ar -Ġchar ge -ĠG reat -Ġce le ----------------- ---------------- -Ġro ck -ro id -an cy -Ġcred it -a ud -B y -ĠE very -Ġmov ed -ing er -rib ution -Ġn ames -Ġstra ight -ĠHe alth -ĠW ell -Ġfe ature -Ġr ule -Ġsc he -in ated -ĠMich ael -ber g -4 1 -il ed -b and -Ġcl ick -ĠAng el 
-on ents -Â Ń -ĠI raq -ĠS aturday -Ġa ware -p art -Ġpat tern -O W -ĠL et -Ġgr ad -ign ed -Ġassoci ated -Ġst yle -n o -i ation -a ith -il ies -Ġst ories -ur ation -Ġindividual s -ĠâĢ ¦ -m iss -ĠAss oci -ish ing -ab y -Ġsum mer -ĠB en -Ġ3 2 -Ġar ch -ut y -ĠTex as -h ol -Ġfull y -Ġm ill -Ġfollow ed -ĠB ill -ĠInd ian -ĠSec ret -ĠB el -ĠFeb ruary -Ġjob s -Ġseem ed -ĠGo vern -i pped -Ġreal ity -Ġl ines -Ġp ark -Ġmeas ure -ĠO ur -I M -Ġbro ther -Ġgrow ing -Ġb an -Ġest im -Ġc ry -ĠS chool -Ġme chan -ĠO F -ĠWind ows -Ġr ates -ĠO h -Ġpos itive -Ġcult ure -ist ics -ic a -Ġh ar -y a -ite ly -i pp -Ġm ap -en cies -ĠWill iam -I I -ak ers -5 6 -ĠM art -ĠR em -Ġal tern -it ude -Ġco ach -row d -D on -Ġk ids -Ġj ournal -Ġcor por -Ġf alse -Ġwe b -Ġsle ep -Ġcont ain -Ġst o -Ġb ed -iver se -ĠR ich -ĠCh inese -Ġp un -Ġme ant -k nown -Ġnot ice -Ġfavor ite -a ven -Ġcond ition -Ġpur pose -) ) -Ġorgan ization -Ġchall eng -Ġman ufact -Ġsus p -ĠA c -Ġcrit ic -un es -uc lear -Ġm er -vent ion -Ġ8 0 -Ġm ist -ĠU s -ĠT or -htt p -ol f -Ġlarg er -Ġadv ant -Ġrese ar -Ġact ions -m l -Ġke pt -Ġa im -, ' -c ol -Ġbenef its -if ying -Ġact ual -ĠIntern ational -Ġveh icle -Ġch ief -Ġeff orts -ĠLe ague -ĠM ost -Ġwa it -Ġad ult -Ġover all -Ġspe ech -Ġhigh ly -Ġfem ale -Ġer ror -Ġeffect ive -5 4 -Ġenc our -w ell -Ġfail ed -Ġcons erv -Ġprogram s -Ġt rou -Ġa head -5 00 -vertis ement -I P -ĠF ound -p ir -Ġ % -Ġcr ime -and er -Ġloc ation -ĠI ran -Ġbehav ior -az ing -Ġr are -Ġem b -Ġca used -Ġsh ip -Ġact ive -Ġcont ribut -Ġg reen -Ġac qu -Ġref lect -ven ue -Ġf irm -Ġb irth -] . -Ġclear ly -Ġem ot -Ġag ency -ri age -Ġmem ory -9 8 -S A -ĠSe e -ac ing -C C -Ġbig gest -Ġr ap -Ġbas ic -Ġb and -e at -Ġsus pect -ĠM ac -Ġ9 0 -m ark -ist an -Ġsp read -am s -k i -as y -ra v -ĠR ober -Ġdemon str -r ated -Ġabs olute -Ġpl aces -Ġim pl -ibr ary -Ġc ards -Ġdest roy -Ġv irt -ve re -Ġapp eared -y an -p oint -Ġbe g -Ġtem per -s pe -ant ed -ear s -ĠD irect -Ġl ength -Ġbl og -am b -Ġint eg -Ġres ources -ac c -if ul -Ġsp ot -Ġfor ced -Ġthous ands -ĠMin ister -Ġqu al -ĠF rench -at ically -Ġgener ally -Ġdr ink -Ġth us -I L -od es -Ġappro pri -ĠRe ad -Ġwh om -Ġey e -Ġcol lege -Ġ4 5 -ire ction -Ġens ure -Ġapp arent -id ers -Ġrelig ious -Ġmin or -ol ic -Ġt ro -ĠWh y -rib ute -m et -Ġprim ary -Ġdevelop ed -Ġpe ace -Ġsk in -st e -av a -Ġbl ue -Ġfam ilies -Ġ ir -Ġapp ly -Ġin form -ĠSm ith -C T -i i -Ġlim it -Ġres ist -........ ........ 
-um n -Ġconf lic -Ġtw e -ud d -ĠT om -Ġl iter -qu e -b on -Ġha ir -Ġevent ually -Ġp us -Ġhelp ed -Ġag g -or ney -ĠApp le -Ġf it -ĠS ur -Ġpre m -Ġs ales -Ġsecond s -Ġstreng th -Ġfeel ing -¿ ½ -Ġt our -Ġknow s -o om -Ġex erc -Ġsom ew -ï ¿½ -> > -Ġsp okes -Ġide as -Ġreg ist -so ft -ĠD el -ĠP C -Ġpro pos -Ġlaun ch -Ġbott om -T H -ĠP lease -v est -it z -ĠIn ter -Ġsc ript -Ġr at -ar ning -Ġ il -ĠJ er -ĠA re -Ġwh atever -ok en -ci ence -Ġmod e -Ġag ree -Ġs ources -Ġinit ial -Ġrest rict -Ġwond er -us ion -## ## -ĠS il -vil le -Ġb urn -t w -as ion -Ġ £ -Ġn or -u ing -Ġre ached -Ġs un -Ġc ateg -ig ration -Ġc ook -Ġprom ot -Ġm ale -Ġcl imate -Ġf ix -Ġalleg ed -U R -all ed -Ġim ages -C ont -ot a -Ġschool s -i os -Ġd rop -Ġst ream -ĠM o -Ġprevious ly -al ing -Ġp et -Ġdou ble -Ġ( @ -ann el -Ġdef ault -t ies -Ġr ank -ĠD ec -ĠCoun cil -Ġweap on -Ġst ock -Ġanal y -ĠSt r -Ġpict ure -ĠPol ice -f erence -Ġcent ury -Ġcitiz ens -Ġon to -Ġexp and -Ġhe ro -ĠS ol -Ġw ild -Ġupd ate -Ġcustom ers -r ont -d ef -Ġl ik -Ġcrim inal -ĠChrist ian -S P -7 6 -Ġle aving -Ġother wise -ĠD ist -Ġbas is -5 2 -5 3 -ic ip -ĠB er -Ġrecomm end -Ġfl oor -Ġc rowd -ol es -Ġ7 0 -Ġcent ral -ĠE v -Ġd ream -Ġdown load -Ġconf ir -ĠTh om -Ġwind ow -Ġhapp ens -Ġun it -Ġt end -Ġs pl -Ġbec omes -Ġfight ing -Ġpred ict -ĠP ress -ĠP ower -Ġhe avy -ak ed -Ġf an -or ter -ate gy -B A -iz es -Ġsp end -H ere -Ġ200 7 -Ġad op -ĠH am -Ġfoot ball -ĠP ort -od ay -5 1 -amp ions -Ġtrans fer -h t -Ġ3 8 -ter m -ac ity -Ġb ur -] , -tern al -r ig -b ut -Ġthere fore -ĠB ecause -res p -re y -Ġm ission -S ome -Ġnot ed -Ġass um -Ġdise ase -Ġed it -Ġprog ress -r d -ĠB rown -oc al -Ġadd ing -Ġra ised -ĠAn y -Ġt ick -Ġsee ing -ĠPe ople -Ġagre ement -Ġser ver -Ġw at -Ġdeb ate -Ġsupp osed -il ing -Ġlarg est -Ġsuccess ful -ĠP ri -ĠDemocr atic -Ġj ump -ĠSyri a -Ġown ers -Ġoff ers -Ġshoot ing -Ġeff ic -se y -Ġha ven -ver se -te red -ĠL ight -im al -ĠB ig -Ġdef end -Ġbe at -Ġrecord s -% ) -Ġsc en -Ġemploy ees -Ġdev ices -he m -Ġcom mer -ĠM ex -Ġbenef it -ĠPro f -Ġil leg -Ġsur face -ĠAl so -Ġh arm -ing ly -w ide -ĠA lex -Ġsh ut -ĠC ur -Ġl ose -p m -Ġchall enge -se mb -Ġst ation -Ġint elligence -Ġacc ur -ĠFl or -Ġrequ ires -ĠM al -b um -Ġh ospital -Ġsp irit -Ġoff ered -Ġprodu ce -ĠComm un -Ġcreat ing -Ġcr is -s pect -Ġend ed -Ġd aily -Ġvot ers -land s -i as -i h -on a -Ġsm art -ĠOff ice -ĠL ord -ri al -ĠIntern et -Ġcirc um -Ġextreme ly -' . 
-Ġopin ion -ĠM il -Ġg ain -B S -ĠF in -y p -Ġuse ful -Ġbud get -Ġcom fort -is f -Ġback ground -el ine -Ġep isode -Ġen emy -Ġtri al -Ġestab lish -d ate -ĠC ap -Ġcontin ues -Ġshow ing -ĠUn ion -w ith -Ġpost ed -ĠSy stem -Ġe at -ri an -Ġr ise -ĠGerman y -il s -Ġsign ed -Ġv ill -Ġgr and -m or -ĠEng land -Ġproject s -um ber -Ġconf erence -z a -Ġrespons ible -ĠAr ab -Ġlearn ed -âĢĶ âĢĶ -i pping -ĠGe orge -O C -Ġreturn ed -ĠAustral ia -Ġb rief -Q u -Ġbr and -ill ing -ab led -Ġhig hest -Ġtr ain -ĠComm ission -wh ile -Ġn om -cept ion -Ġm ut -ĠBl ue -Ġinc ident -v ant -8 6 -ĠI D -Ġn uclear -7 4 -ĠL ike -ĠR E -ĠM icro -l i -m ail -Ġcharg es -8 9 -Ġad just -ad o -Ġear th -N A -Ġpr ices -P A -Ġd raft -Ġrun s -Ġcandid ate -ens es -Ġmanag ement -ĠPh il -ĠM iss -Ġte ach -g ram -Ġunderstand ing -a it -ic ago -A dd -ĠE p -sec ut -Ġsepar ate -Ġinst ance -Ġe th -Ġun less -**** **** -ĠF ore -in ate -Ġoper ations -S p -Ġf aith -g ar -ĠCh urch -ron ic -Ġconf ig -os ure -Ġactiv ities -Ġtrad itional -Ġ3 6 -Ġd irection -Ġmach ine -Ġsur round -Ġp ush -un ction -ĠE U -Ġeas ier -Ġarg ument -G B -Ġm icro -Ġsp ending -iz ations -Ġthe ory -ad ow -Ġcall ing -ĠL ast -Ġd er -Ġinflu ence -Ġcomm it -Ġph oto -Ġun c -ist ry -g n -ast e -ack s -Ġdis p -ad y -d o -ĠG ood -Ġ ` -Ġw ish -Ġreve aled -Âł Âł -l ig -Ġen force -ĠComm ittee -Ġche m -Ġmil es -Ġinterest ed -Ġsol ution -ic y -in ct -Ġ- > -ĠD et -Ġrem oved -Ġcomp ar -e ah -Ġpl ant -ĠS ince -Ġachie ve -Ġadvant age -Ġslight ly -b ing -Ġpl aced -u nder -201 5 -ĠM ad -Ġt im -os es -Ġc ru -ĠR ock -Ġmost ly -Ġneg ative -Ġset ting -Ġprodu ced -Ġm ur -Ġconnect ion -ĠM er -Ġdri ver -Ġexecut ive -Ġass ault -Ġb orn -ĠV er -t ained -Ġstruct ure -Ġredu ce -Ġdec ades -Ġd ed -u ke -ĠM any -idd en -Ġle ague -S e -Ġjo in -Ġdis co -Ġd ie -c ks -act ions -Ġass ess -ag n -Ġgo als -our s -I R -Ġsen ior -ill er -m od -ip ment -oc ol -u y -ĠQ ue -Ġpart ies -ir gin -Ġle arning -it able -Ġstre et -Ġcamer a -A pp -Ġsk ills -b re -c ious -Ġcele br -ĠFr anc -Ġexist ing -Ġwill ing -l or -Ġ id -ĠSp ace -Ġcrit ical -ĠL a -ortun ately -Ġser ve -Ġc old -Ġspec ies -T S -Ġanim als -ĠB ay -Ġold er -ĠU nder -est ic -ĠT re -Ġte acher -Ġpre fer -v is -Ġth read -ĠM att -Ġmanag er -ãĥ » -Ġprofess ional -ĠV ol -Ġnot es -The se -ul a -Ġf resh -ent ed -u zz -ed y -clus ion -ĠR el -Ġdoub t -E O -Ġopen ed -ĠB it -Ad vertisement -Ġgu ess -ĠU N -Ġse qu -Ġexpl ain -ott en -Ġatt ract -ak s -Ġstr ing -Ġcont ext -oss ible -ĠRepublic ans -Ġsol id -Ġc ities -Ġask ing -Ġr andom -u ps -ur ies -ar ant -dd en -g l -ĠFlor ida -Ġdep end -ĠSc ott -Ġ3 3 -Ġi T -ic on -Ġmention ed -Ġ2 000 -Ġclaim ed -Ġdefin itely -ul f -Ġc ore -Ġopen ing -ĠCon st -wh ich -ĠT ra -A G -7 2 -Ġbelie ved -ad a -Ġ4 8 -ĠSec urity -yr ight -ĠP et -ĠL ou -Ġhold ing -======== ======== -Ġ ice -Ġb row -Ġauthor ities -h ost -w ord -Ġsc ore -ĠD iv -Ġcell s -Ġtrans l -Ġneigh bor -Ġrem ove -u ct -Ġdist rict -ĠA ccording -Ġwor se -Ġconcern s -Ġpresident ial -Ġpolic ies -ĠH all -7 3 -Ġh us -A Y -Ġ200 6 -ĠJ ud -Ġindepend ent -ĠJust ice -ili ar -pr int -igh ter -Ġprotect ion -z en -Ġsu dden -h ouse -ĠJ es -P R -ĠIn f -Ġb ul -Ġ _ -ĠServ ice -ĠP R -Ġstr ategy -ff ect -Ġgirl s -Ġmiss ing -oy al -ĠTe am -ul ated -Ġd at -Ġpolit ics -ab or -A ccording -Ġspe ll -Ġg raph -ort hern -T C -A b -Ġlab or -is her -Ġk ick -ĠiT unes -Ġstep s -pos es -Ġsmall er -E n -ber t -Ġro ll -Ġresear chers -Ġcl osed -Ġtrans port -Ġlaw y -________ ________ -ĠCh icago -Ġas pect -Ġn one -Ġmar riage -9 6 -Ġe lements -ĠF re -ĠS al -Ġd ram -F C -t op -e qu -Ġhe aring -Ġsupport ed -Ġtest ing -co hol -Ġmass 
ive -Ġst ick -Ġgu ard -is co -ph one -F rom -How ever -Ġb order -Ġcop y -ograph y -l ist -7 1 -Ġown er -cl ass -ru it -r ate -ĠO nce -Ġdig ital -Ġt ask -ER S -Ġinc red -t es -+ + -ĠFr ance -Ġb reat -ow l -Ġiss ued -ĠW estern -Ġdet ect -Ġpart ners -Ġsh ared -ĠC all -Ġcan cer -ac he -rib e -Ġexpl ained -Ġhe at -{ " -Ġinvest ment -ĠB ook -Ġw ood -Ġtool s -ĠAl though -Ġbelie f -Ġcris is -Ġg e -ĠM P -Ġoper ation -ty pe -~ ~ -g a -Ġcont ains -ant a -Ġexp ress -ĠG roup -ĠJ ournal -k a -Ġam b -ĠUS A -Ġfind ing -Ġfund ing -h ow -Ġestab lished -ide os -Ġdeg ree -Ġdanger ous -ang ing -Ġfre edom -pp ort -out hern -Ġch urch -Ġc atch -ĠTw o -Ġpres ence -ĠGu ard -U p -Ġauthor ity -ĠPro ject -Ġbut ton -Ġcon sequ -Ġval id -Ġwe ak -Ġstart s -Ġref erence -ĠM em -" ) -U N -or age -ĠO pen -Ġcol lection -y m -g ency -Ġbeaut iful -ro s -Ġtell s -Ġwa iting -n el -Ġprov iding -ĠDemocr ats -Ġd aughter -Ġm aster -Ġpur poses -ĠJapan ese -Ġequ al -Ġturn s -Ġdoc uments -Ġwatch ing -R es -Ġr an -201 4 -Ġre ject -ĠKore a -Ġvictim s -Le vel -ere nces -Ġw itness -Ġ3 4 -Ġre form -com ing -Ġocc up -Ġc aught -Ġtra ffic -ad ing -Ġmod els -ar io -Ġserv ed -Ġb atter -u ate -ĠSecret ary -Ġagre ed -Ġtr uly -yn am -ĠR et -Ġun its -ĠRes earch -h and -az ine -ĠM ike -Ġvar iety -ot al -Ġam azing
diff --git a/tests/data/t0/gpt2-tiny-vocab.json b/tests/data/t0/gpt2-tiny-vocab.json
deleted file mode 100644
index f83c21b01..000000000
--- a/tests/data/t0/gpt2-tiny-vocab.json
+++ /dev/null
@@ -1 +0,0 @@
-{"!":0,"\"":1,"#":2,"$":3,"%":4,"&":5,"'":6,"(":7,")":8,"*":9,"+":10,",":11,"-":12,".":13,"/":14,"0":15,"1":16,"2":17,"3":18,"4":19,"5":20,"6":21,"7":22,"8":23,"9":24,":":25,";":26,"<":27,"=":28,">":29,"?":30,"@":31,"A":32,"B":33,"C":34,"D":35,"E":36,"F":37,"G":38,"H":39,"I":40,"J":41,"K":42,"L":43,"M":44,"N":45,"O":46,"P":47,"Q":48,"R":49,"S":50,"T":51,"U":52,"V":53,"W":54,"X":55,"Y":56,"Z":57,"[":58,"\\":59,"]":60,"^":61,"_":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"{":90,"|":91,"}":92,"~":93,"¡":94,"¢":95,"£":96,"¤":97,"¥":98,"¦":99,"§":100,"¨":101,"©":102,"ª":103,"«":104,"¬":105,"®":106,"¯":107,"°":108,"±":109,"²":110,"³":111,"´":112,"µ":113,"¶":114,"·":115,"¸":116,"¹":117,"º":118,"»":119,"¼":120,"½":121,"¾":122,"¿":123,"À":124,"Á":125,"Â":126,"Ã":127,"Ä":128,"Å":129,"Æ":130,"Ç":131,"È":132,"É":133,"Ê":134,"Ë":135,"Ì":136,"Í":137,"Î":138,"Ï":139,"Ð":140,"Ñ":141,"Ò":142,"Ó":143,"Ô":144,"Õ":145,"Ö":146,"×":147,"Ø":148,"Ù":149,"Ú":150,"Û":151,"Ü":152,"Ý":153,"Þ":154,"ß":155,"à":156,"á":157,"â":158,"ã":159,"ä":160,"å":161,"æ":162,"ç":163,"è":164,"é":165,"ê":166,"ë":167,"ì":168,"í":169,"î":170,"ï":171,"ð":172,"ñ":173,"ò":174,"ó":175,"ô":176,"õ":177,"ö":178,"÷":179,"ø":180,"ù":181,"ú":182,"û":183,"ü":184,"ý":185,"þ":186,"ÿ":187,"Ā":188,"ā":189,"Ă":190,"ă":191,"Ą":192,"ą":193,"Ć":194,"ć":195,"Ĉ":196,"ĉ":197,"Ċ":198,"ċ":199,"Č":200,"č":201,"Ď":202,"ď":203,"Đ":204,"đ":205,"Ē":206,"ē":207,"Ĕ":208,"ĕ":209,"Ė":210,"ė":211,"Ę":212,"ę":213,"Ě":214,"ě":215,"Ĝ":216,"ĝ":217,"Ğ":218,"ğ":219,"Ġ":220,"ġ":221,"Ģ":222,"ģ":223,"Ĥ":224,"ĥ":225,"Ħ":226,"ħ":227,"Ĩ":228,"ĩ":229,"Ī":230,"ī":231,"Ĭ":232,"ĭ":233,"Į":234,"į":235,"İ":236,"ı":237,"IJ":238,"ij":239,"Ĵ":240,"ĵ":241,"Ķ":242,"ķ":243,"ĸ":244,"Ĺ":245,"ĺ":246,"Ļ":247,"ļ":248,"Ľ":249,"ľ":250,"Ŀ":251,"ŀ":252,"Ł":253,"ł":254,"Ń":255,"Ġt":256,"Ġa":257,"he":258,"in":259,"re":260,"on":261,"Ġthe":262,"er":263,"Ġs":264,"at":265,"Ġw":266,"Ġo":267,"en":268,"Ġc":269,"it":2
70,"is":271,"an":272,"or":273,"es":274,"Ġb":275,"ed":276,"Ġf":277,"ing":278,"Ġp":279,"ou":280,"Ġan":281,"al":282,"ar":283,"Ġto":284,"Ġm":285,"Ġof":286,"Ġin":287,"Ġd":288,"Ġh":289,"Ġand":290,"ic":291,"as":292,"le":293,"Ġth":294,"ion":295,"om":296,"ll":297,"ent":298,"Ġn":299,"Ġl":300,"st":301,"Ġre":302,"ve":303,"Ġe":304,"ro":305,"ly":306,"Ġbe":307,"Ġg":308,"ĠT":309,"ct":310,"ĠS":311,"id":312,"ot":313,"ĠI":314,"ut":315,"et":316,"ĠA":317,"Ġis":318,"Ġon":319,"im":320,"am":321,"ow":322,"ay":323,"ad":324,"se":325,"Ġthat":326,"ĠC":327,"ig":328,"Ġfor":329,"ac":330,"Ġy":331,"ver":332,"ur":333,"Ġu":334,"ld":335,"Ġst":336,"ĠM":337,"'s":338,"Ġhe":339,"Ġit":340,"ation":341,"ith":342,"ir":343,"ce":344,"Ġyou":345,"il":346,"ĠB":347,"Ġwh":348,"ol":349,"ĠP":350,"Ġwith":351,"Ġ1":352,"ter":353,"ch":354,"Ġas":355,"Ġwe":356,"Ġ(":357,"nd":358,"ill":359,"ĠD":360,"if":361,"Ġ2":362,"ag":363,"ers":364,"ke":365,"Ġ\"":366,"ĠH":367,"em":368,"Ġcon":369,"ĠW":370,"ĠR":371,"her":372,"Ġwas":373,"Ġr":374,"od":375,"ĠF":376,"ul":377,"ate":378,"Ġat":379,"ri":380,"pp":381,"ore":382,"ĠThe":383,"Ġse":384,"us":385,"Ġpro":386,"Ġha":387,"um":388,"Ġare":389,"Ġde":390,"ain":391,"and":392,"Ġor":393,"igh":394,"est":395,"ist":396,"ab":397,"rom":398,"ĠN":399,"th":400,"Ġcom":401,"ĠG":402,"un":403,"op":404,"00":405,"ĠL":406,"Ġnot":407,"ess":408,"Ġex":409,"Ġv":410,"res":411,"ĠE":412,"ew":413,"ity":414,"ant":415,"Ġby":416,"el":417,"os":418,"ort":419,"oc":420,"qu":421,"Ġfrom":422,"Ġhave":423,"Ġsu":424,"ive":425,"ould":426,"Ġsh":427,"Ġthis":428,"nt":429,"ra":430,"pe":431,"ight":432,"art":433,"ment":434,"Ġal":435,"ust":436,"end":437,"--":438,"all":439,"ĠO":440,"ack":441,"Ġch":442,"Ġle":443,"ies":444,"red":445,"ard":446,"âĢ":447,"out":448,"ĠJ":449,"Ġab":450,"ear":451,"iv":452,"ally":453,"our":454,"ost":455,"gh":456,"pt":457,"Ġpl":458,"ast":459,"Ġcan":460,"ak":461,"ome":462,"ud":463,"The":464,"Ġhis":465,"Ġdo":466,"Ġgo":467,"Ġhas":468,"ge":469,"'t":470,"ĠU":471,"rou":472,"Ġsa":473,"Ġj":474,"Ġbut":475,"Ġwor":476,"Ġall":477,"ect":478,"Ġk":479,"ame":480,"Ġwill":481,"ok":482,"Ġwhe":483,"Ġthey":484,"ide":485,"01":486,"ff":487,"ich":488,"pl":489,"ther":490,"Ġtr":491,"..":492,"Ġint":493,"ie":494,"ure":495,"age":496,"Ġne":497,"ial":498,"ap":499,"ine":500,"ice":501,"Ġme":502,"Ġout":503,"ans":504,"one":505,"ong":506,"ions":507,"Ġwho":508,"ĠK":509,"Ġup":510,"Ġtheir":511,"Ġad":512,"Ġ3":513,"Ġus":514,"ated":515,"ous":516,"Ġmore":517,"ue":518,"og":519,"ĠSt":520,"ind":521,"ike":522,"Ġso":523,"ime":524,"per":525,".\"":526,"ber":527,"iz":528,"act":529,"Ġone":530,"Ġsaid":531,"Ġ-":532,"are":533,"Ġyour":534,"cc":535,"ĠTh":536,"Ġcl":537,"ep":538,"ake":539,"able":540,"ip":541,"Ġcont":542,"Ġwhich":543,"ia":544,"Ġim":545,"Ġabout":546,"Ġwere":547,"very":548,"ub":549,"Ġhad":550,"Ġen":551,"Ġcomp":552,",\"":553,"ĠIn":554,"Ġun":555,"Ġag":556,"ire":557,"ace":558,"au":559,"ary":560,"Ġwould":561,"ass":562,"ry":563,"ĠâĢ":564,"cl":565,"ook":566,"ere":567,"so":568,"ĠV":569,"ign":570,"ib":571,"Ġoff":572,"Ġte":573,"ven":574,"ĠY":575,"ile":576,"ose":577,"ite":578,"orm":579,"Ġ201":580,"Ġres":581,"Ġman":582,"Ġper":583,"Ġother":584,"ord":585,"ult":586,"Ġbeen":587,"Ġlike":588,"ase":589,"ance":590,"ks":591,"ays":592,"own":593,"ence":594,"Ġdis":595,"ction":596,"Ġany":597,"Ġapp":598,"Ġsp":599,"int":600,"ress":601,"ations":602,"ail":603,"Ġ4":604,"ical":605,"Ġthem":606,"Ġher":607,"ount":608,"ĠCh":609,"Ġar":610,"Ġif":611,"Ġthere":612,"Ġpe":613,"Ġyear":614,"av":615,"Ġmy":616,"Ġsome":617,"Ġwhen":618,"ough":619,"ach":620,"Ġthan":621,"ru":622,"ond":623,"ick":624,"Ġover":625,"vel":626,"Ġqu":627,"ĊĊ":628
,"Ġsc":629,"reat":630,"ree":631,"ĠIt":632,"ound":633,"port":634,"Ġalso":635,"Ġpart":636,"fter":637,"Ġkn":638,"Ġbec":639,"Ġtime":640,"ens":641,"Ġ5":642,"ople":643,"Ġwhat":644,"Ġno":645,"du":646,"mer":647,"ang":648,"Ġnew":649,"----":650,"Ġget":651,"ory":652,"ition":653,"ings":654,"Ġjust":655,"Ġinto":656,"Ġ0":657,"ents":658,"ove":659,"te":660,"Ġpeople":661,"Ġpre":662,"Ġits":663,"Ġrec":664,"Ġtw":665,"ian":666,"irst":667,"ark":668,"ors":669,"Ġwork":670,"ade":671,"ob":672,"Ġshe":673,"Ġour":674,"wn":675,"ink":676,"lic":677,"Ġ19":678,"ĠHe":679,"ish":680,"nder":681,"ause":682,"Ġhim":683,"ons":684,"Ġ[":685,"Ġro":686,"form":687,"ild":688,"ates":689,"vers":690,"Ġonly":691,"oll":692,"Ġspe":693,"ck":694,"ell":695,"amp":696,"Ġacc":697,"Ġbl":698,"ious":699,"urn":700,"ft":701,"ood":702,"Ġhow":703,"hed":704,"Ġ'":705,"Ġafter":706,"aw":707,"Ġatt":708,"ov":709,"ne":710,"Ġplay":711,"erv":712,"ict":713,"Ġcould":714,"itt":715,"Ġam":716,"Ġfirst":717,"Ġ6":718,"Ġact":719,"Ġ$":720,"ec":721,"hing":722,"ual":723,"ull":724,"Ġcomm":725,"oy":726,"old":727,"ces":728,"ater":729,"Ġfe":730,"Ġbet":731,"we":732,"iff":733,"Ġtwo":734,"ock":735,"Ġback":736,").":737,"ident":738,"Ġunder":739,"rough":740,"sel":741,"xt":742,"Ġmay":743,"round":744,"Ġpo":745,"ph":746,"iss":747,"Ġdes":748,"Ġmost":749,"Ġdid":750,"Ġadd":751,"ject":752,"Ġinc":753,"fore":754,"Ġpol":755,"ont":756,"Ġagain":757,"clud":758,"tern":759,"Ġknow":760,"Ġneed":761,"Ġcons":762,"Ġco":763,"Ġ.":764,"Ġwant":765,"Ġsee":766,"Ġ7":767,"ning":768,"iew":769,"ĠThis":770,"ced":771,"Ġeven":772,"Ġind":773,"ty":774,"ĠWe":775,"ath":776,"Ġthese":777,"Ġpr":778,"Ġuse":779,"Ġbecause":780,"Ġfl":781,"ng":782,"Ġnow":783,"ĠâĢĵ":784,"com":785,"ise":786,"Ġmake":787,"Ġthen":788,"ower":789,"Ġevery":790,"ĠUn":791,"Ġsec":792,"oss":793,"uch":794,"Ġem":795,"Ġ=":796,"ĠRe":797,"ied":798,"rit":799,"Ġinv":800,"lect":801,"Ġsupp":802,"ating":803,"Ġlook":804,"man":805,"pect":806,"Ġ8":807,"row":808,"Ġbu":809,"Ġwhere":810,"ific":811,"Ġyears":812,"ily":813,"Ġdiff":814,"Ġshould":815,"Ġrem":816,"Th":817,"In":818,"Ġev":819,"day":820,"'re":821,"rib":822,"Ġrel":823,"ss":824,"Ġdef":825,"Ġright":826,"Ġsy":827,"),":828,"les":829,"000":830,"hen":831,"Ġthrough":832,"ĠTr":833,"__":834,"Ġway":835,"Ġdon":836,"Ġ,":837,"Ġ10":838,"ased":839,"Ġass":840,"ublic":841,"Ġreg":842,"ĠAnd":843,"ix":844,"Ġvery":845,"Ġinclud":846,"other":847,"Ġimp":848,"oth":849,"Ġsub":850,"ĠâĢĶ":851,"Ġbeing":852,"arg":853,"ĠWh":854,"==":855,"ible":856,"Ġdoes":857,"ange":858,"ram":859,"Ġ9":860,"ert":861,"ps":862,"ited":863,"ational":864,"Ġbr":865,"Ġdown":866,"Ġmany":867,"aking":868,"Ġcall":869,"uring":870,"ities":871,"Ġph":872,"ics":873,"als":874,"Ġdec":875,"ative":876,"ener":877,"Ġbefore":878,"ility":879,"Ġwell":880,"Ġmuch":881,"erson":882,"Ġthose":883,"Ġsuch":884,"Ġke":885,"Ġend":886,"ĠBut":887,"ason":888,"ting":889,"Ġlong":890,"ef":891,"Ġthink":892,"ys":893,"Ġbel":894,"Ġsm":895,"its":896,"ax":897,"Ġown":898,"Ġprov":899,"Ġset":900,"ife":901,"ments":902,"ble":903,"ward":904,"Ġshow":905,"Ġpres":906,"ms":907,"omet":908,"Ġob":909,"Ġsay":910,"ĠSh":911,"ts":912,"ful":913,"Ġeff":914,"Ġgu":915,"Ġinst":916,"und":917,"ren":918,"cess":919,"Ġent":920,"ĠYou":921,"Ġgood":922,"Ġstart":923,"ince":924,"Ġmade":925,"tt":926,"stem":927,"olog":928,"up":929,"Ġ|":930,"ump":931,"Ġhel":932,"vern":933,"ular":934,"ually":935,"Ġac":936,"Ġmon":937,"Ġlast":938,"Ġ200":939,"10":940,"Ġstud":941,"ures":942,"ĠAr":943,"self":944,"ars":945,"meric":946,"ues":947,"cy":948,"Ġmin":949,"ollow":950,"Ġcol":951,"io":952,"Ġmod":953,"Ġcount":954,"ĠCom":955,"hes":956,"Ġfin":957,"air":958,"ier":959,"â
ĢĶ":960,"read":961,"ank":962,"atch":963,"ever":964,"Ġstr":965,"Ġpoint":966,"ork":967,"ĠNew":968,"Ġsur":969,"ool":970,"alk":971,"ement":972,"Ġused":973,"ract":974,"ween":975,"Ġsame":976,"oun":977,"ĠAl":978,"ci":979,"Ġdiffere":980,"Ġwhile":981,"--------":982,"Ġgame":983,"cept":984,"Ġsim":985,"...":986,"Ġinter":987,"ek":988,"Ġreport":989,"Ġprodu":990,"Ġstill":991,"led":992,"ah":993,"Ġhere":994,"Ġworld":995,"Ġthough":996,"Ġnum":997,"arch":998,"imes":999,"ale":1000,"ĠSe":1001,"ĠIf":1002,"//":1003,"ĠLe":1004,"Ġret":1005,"Ġref":1006,"Ġtrans":1007,"ner":1008,"ution":1009,"ters":1010,"Ġtake":1011,"ĠCl":1012,"Ġconf":1013,"way":1014,"ave":1015,"Ġgoing":1016,"Ġsl":1017,"ug":1018,"ĠAmeric":1019,"Ġspec":1020,"Ġhand":1021,"Ġbetween":1022,"ists":1023,"ĠDe":1024,"oot":1025,"It":1026,"Ġear":1027,"Ġagainst":1028,"Ġhigh":1029,"gan":1030,"az":1031,"ather":1032,"Ġexp":1033,"Ġop":1034,"Ġins":1035,"Ġgr":1036,"Ġhelp":1037,"Ġrequ":1038,"ets":1039,"ins":1040,"ĠPro":1041,"ism":1042,"Ġfound":1043,"land":1044,"ata":1045,"uss":1046,"ames":1047,"Ġperson":1048,"Ġgreat":1049,"pr":1050,"Ġsign":1051,"ĠAn":1052,"'ve":1053,"Ġsomet":1054,"Ġser":1055,"hip":1056,"Ġrun":1057,"Ġ:":1058,"Ġter":1059,"irect":1060,"Ġfollow":1061,"Ġdet":1062,"ices":1063,"Ġfind":1064,"12":1065,"Ġmem":1066,"Ġcr":1067,"ered":1068,"ex":1069,"Ġext":1070,"uth":1071,"ense":1072,"co":1073,"Ġteam":1074,"ving":1075,"ouse":1076,"ash":1077,"att":1078,"ved":1079,"Ġsystem":1080,"ĠAs":1081,"der":1082,"ives":1083,"min":1084,"Ġlead":1085,"ĠBl":1086,"cent":1087,"Ġaround":1088,"Ġgovern":1089,"Ġcur":1090,"velop":1091,"any":1092,"Ġcour":1093,"alth":1094,"ages":1095,"ize":1096,"Ġcar":1097,"ode":1098,"Ġlaw":1099,"Ġread":1100,"'m":1101,"con":1102,"Ġreal":1103,"Ġsupport":1104,"Ġ12":1105,"....":1106,"Ġreally":1107,"ness":1108,"Ġfact":1109,"Ġday":1110,"Ġboth":1111,"ying":1112,"Ġserv":1113,"ĠFor":1114,"Ġthree":1115,"Ġwom":1116,"Ġmed":1117,"ody":1118,"ĠThey":1119,"50":1120,"Ġexper":1121,"ton":1122,"Ġeach":1123,"akes":1124,"Ġche":1125,"Ġcre":1126,"ines":1127,"Ġrep":1128,"19":1129,"gg":1130,"illion":1131,"Ġgrou":1132,"ute":1133,"ik":1134,"We":1135,"get":1136,"ER":1137,"Ġmet":1138,"Ġsays":1139,"ox":1140,"Ġduring":1141,"ern":1142,"ized":1143,"ared":1144,"Ġfam":1145,"ically":1146,"Ġhapp":1147,"ĠIs":1148,"Ġchar":1149,"med":1150,"vent":1151,"Ġgener":1152,"ient":1153,"ple":1154,"iet":1155,"rent":1156,"11":1157,"ves":1158,"ption":1159,"Ġ20":1160,"formation":1161,"Ġcor":1162,"Ġoffic":1163,"ield":1164,"Ġtoo":1165,"ision":1166,"Ġinf":1167,"ĠZ":1168,"the":1169,"oad":1170,"Ġpublic":1171,"Ġprog":1172,"ric":1173,"**":1174,"Ġwar":1175,"Ġpower":1176,"view":1177,"Ġfew":1178,"Ġloc":1179,"Ġdifferent":1180,"Ġstate":1181,"Ġhead":1182,"'ll":1183,"Ġposs":1184,"Ġstat":1185,"ret":1186,"ants":1187,"Ġval":1188,"Ġiss":1189,"Ġcle":1190,"ivers":1191,"anc":1192,"Ġexpl":1193,"Ġanother":1194,"ĠQ":1195,"Ġav":1196,"thing":1197,"nce":1198,"Wh":1199,"Ġchild":1200,"Ġsince":1201,"ired":1202,"less":1203,"Ġlife":1204,"Ġdevelop":1205,"ittle":1206,"Ġdep":1207,"Ġpass":1208,"ãĥ":1209,"Ġturn":1210,"orn":1211,"This":1212,"bers":1213,"ross":1214,"ĠAd":1215,"Ġfr":1216,"Ġresp":1217,"Ġsecond":1218,"oh":1219,"Ġ/":1220,"Ġdisc":1221,"Ġ&":1222,"Ġsomething":1223,"Ġcomple":1224,"Ġed":1225,"Ġfil":1226,"Ġmonth":1227,"aj":1228,"uc":1229,"Ġgovernment":1230,"Ġwithout":1231,"Ġleg":1232,"Ġdist":1233,"Ġput":1234,"Ġquest":1235,"ann":1236,"Ġprot":1237,"20":1238,"Ġnever":1239,"ience":1240,"Ġlevel":1241,"Ġart":1242,"Ġthings":1243,"Ġmight":1244,"Ġeffect":1245,"Ġcontro":1246,"Ġcent":1247,"Ġ18":1248,"Ġallow":1249,"Ġbelie":1250,"chool":1251,"ott":1252,"Ġin
cre":1253,"Ġfeel":1254,"Ġresult":1255,"Ġlot":1256,"Ġfun":1257,"ote":1258,"Ġty":1259,"erest":1260,"Ġcontin":1261,"Ġusing":1262,"Ġbig":1263,"201":1264,"Ġask":1265,"Ġbest":1266,"Ġ)":1267,"IN":1268,"Ġopp":1269,"30":1270,"Ġnumber":1271,"iness":1272,"St":1273,"lease":1274,"Ġca":1275,"Ġmust":1276,"Ġdirect":1277,"Ġgl":1278,"Ġ<":1279,"Ġopen":1280,"Ġpost":1281,"Ġcome":1282,"Ġseem":1283,"ording":1284,"Ġweek":1285,"ately":1286,"ital":1287,"Ġel":1288,"riend":1289,"Ġfar":1290,"Ġtra":1291,"inal":1292,"Ġpri":1293,"ĠUS":1294,"Ġplace":1295,"Ġform":1296,"Ġtold":1297,"\":":1298,"ains":1299,"ature":1300,"ĠTrump":1301,"Ġstand":1302,"Ġ#":1303,"ider":1304,"ĠFr":1305,"Ġnext":1306,"Ġsoc":1307,"Ġpur":1308,"Ġlet":1309,"Ġlittle":1310,"Ġhum":1311,"Ġi":1312,"ron":1313,"15":1314,"Ġ15":1315,"Ġcommun":1316,"Ġmark":1317,"ĠThere":1318,"Ġwr":1319,"ĠThat":1320,"Ġinformation":1321,"ways":1322,"Ġbus":1323,"app":1324,"Ġinvest":1325,"me":1326,"Ġhard":1327,"ained":1328,"ead":1329,"Ġimport":1330,"Ġappro":1331,"Ġtest":1332,"Ġtri":1333,"Ġrest":1334,"osed":1335,"Ġfull":1336,"Ġcare":1337,"ĠSp":1338,"Ġcase":1339,"ON":1340,"Ġsk":1341,"Ġless":1342,"Ġ+":1343,"Ġpartic":1344,"ĠPl":1345,"ably":1346,"uck":1347,"ished":1348,"chn":1349,"be":1350,"Ġlist":1351,"ator":1352,"Ġtop":1353,"Ġadv":1354,"ĠBe":1355,"ruct":1356,"Ġdem":1357,"ration":1358,"ling":1359,"gy":1360,"reen":1361,"ger":1362,"Ġhome":1363,"Ġleft":1364,"Ġbetter":1365,"Ġdata":1366,"Ġ11":1367,"Ġattack":1368,"Ġproble":1369,"line":1370,"ards":1371,"Ġbeh":1372,"ral":1373,"ĠHow":1374,"ĠShe":1375,"arge":1376,"Ġ--":1377,"://":1378,"Ġbro":1379,"ĠPh":1380,"ats":1381,"Ġbuild":1382,"ww":1383,"ided":1384,"aim":1385,"ases":1386,"ency":1387,"Ġmain":1388,"ined":1389,"Ġincluding":1390,"Ġ{":1391,"Ġgot":1392,"Ġinterest":1393,"Ġkeep":1394,"ĠX":1395,"Ġeas":1396,"aining":1397,"Ġclass":1398,"âĢ¦":1399,"ĠNo":1400,"Ġvar":1401,"Ġsmall":1402,"ample":1403,"AT":1404,"Ġide":1405,"ĠSo":1406,"Ġrece":1407,"Ġpolit":1408,"Ġmov":1409,"Ġplan":1410,"Ġpercent":1411,"iving":1412,"Ġcamp":1413,"Ġpay":1414,"14":1415,"sc":1416,"ised":1417,"Ġunt":1418,"oney":1419,"ploy":1420,"====":1421,"Ġdidn":1422,"ĠInd":1423,"els":1424,"ertain":1425,"Ġpos":1426,"____":1427,"iver":1428,"Ġprocess":1429,"Ġprogram":1430,"ified":1431,"ĠRep":1432,"16":1433,"uro":1434,"ology":1435,"atter":1436,"ina":1437,"Ġname":1438,"ĠAll":1439,"Ġfour":1440,"Ġreturn":1441,"vious":1442,"bs":1443,"Ġcalled":1444,"Ġmove":1445,"ĠSc":1446,"ird":1447,"Ġgroup":1448,"Ġbre":1449,"Ġmen":1450,"Ġcap":1451,"ten":1452,"ee":1453,"Ġdri":1454,"leg":1455,"here":1456,"uthor":1457,"Ġpat":1458,"Ġcurrent":1459,"ides":1460,"Ġpop":1461,"to":1462,"ention":1463,"Ġalways":1464,"Ġmil":1465,"Ġwomen":1466,"Ġ16":1467,"Ġold":1468,"iven":1469,"raph":1470,"ĠOr":1471,"ror":1472,"ently":1473,"Ġnear":1474,"ĠEx":1475,"ream":1476,"sh":1477,"Ġ14":1478,"Ġfree":1479,"ission":1480,"stand":1481,"ĠCon":1482,"ality":1483,"used":1484,"13":1485,"Ġdesign":1486,"Ġchange":1487,"Ġchang":1488,"Ġbo":1489,"Ġvis":1490,"ember":1491,"Ġbook":1492,"ready":1493,"Ġkill":1494,"25":1495,"pped":1496,"Ġaway":1497,"Ġable":1498,"Ġcountry":1499,"Ġconst":1500,"arn":1501,"Ġorder":1502,"AR":1503,"ior":1504,"ium":1505,"orth":1506,"18":1507,"ailable":1508,"Ġsw":1509,"Ġmillion":1510,"Ġ13":1511,"atic":1512,"ted":1513,"ĠGo":1514,"Ġoper":1515,"eng":1516,"Ġthing":1517,"ajor":1518,"conom":1519,"ĠComm":1520,"Ġwhy":1521,"ured":1522,"ural":1523,"Ġschool":1524,"by":1525,"ĠMar":1526,"Ġaff":1527,"Ġdays":1528,"Ġann":1529,"ush":1530,"ane":1531,"If":1532,"eg":1533,"Ġprof":1534,"Ġhealth":1535,"outh":1536,"But":1537,"ional":1538,".,":1539,"Ġsol":1540,"Ġalread
y":1541,"Ġ30":1542,"Ġcharact":1543,"He":1544,"Ġfriend":1545,"ES":1546,"ians":1547,"icle":1548,"'d":1549,"ĠOn":1550,"Ġleast":1551,"Ġprom":1552,"Ġdr":1553,"Ġhist":1554,"ither":1555,"Ġest":1556,"iqu":1557,"17":1558,"son":1559,"Ġtell":1560,"Ġtalk":1561,"ohn":1562,"oint":1563,"lection":1564,"AN":1565,"Ġuntil":1566,"augh":1567,"Ġlater":1568,"Ġve":1569,"Ġview":1570,"ending":1571,"ived":1572,"Ġword":1573,"ware":1574,"Ġcost":1575,"Ġenough":1576,"Ġgive":1577,"ĠUnited":1578,"Ġtechn":1579,"arent":1580,"OR":1581,"Ġpar":1582,"ĠDr":1583,"Ġ2016":1584,"rist":1585,"ering":1586,"ĠÂ":1587,"Ġlarge":1588,"side":1589,"acy":1590,"ccess":1591,"Ġwin":1592,"Ġimportant":1593,"Ġ199":1594,"Ġdoesn":1595,"Ġ17":1596,"Ġbusiness":1597,"Ġclear":1598,"Ġrese":1599,"\",":1600,"ury":1601,"Ġequ":1602,"aster":1603,"alf":1604,"ĠAmerican":1605,"nect":1606,"Ġexpect":1607,"iversity":1608,"Ġocc":1609,"ĠFl":1610,"Ġkind":1611,"Ġmean":1612,"Ġpast":1613,"Ġdev":1614,"Ġbas":1615,"let":1616,"raft":1617,"Ġorgan":1618,"Ġdel":1619,"Ġperform":1620,"Ġstory":1621,"Ġseason":1622,"ĠCol":1623,"Ġclaim":1624,"Ġcame":1625,"Ġwithin":1626,"Ġline":1627,"Ġproject":1628,"ĠAt":1629,"Ġcontrol":1630,"ended":1631,"ĠSy":1632,"Ġair":1633,"ization":1634,"Ġ*":1635,"ley":1636,"Ġmoney":1637,"idd":1638,"You":1639,"for":1640,"Ġfamily":1641,"Ġmaking":1642,"Ġbit":1643,"Ġpolice":1644,"Ġhappen":1645,"Ġvers":1646,"ony":1647,"uff":1648,"ĠWhen":1649,"Ġsit":1650,"ideo":1651,"lf":1652,"ison":1653,"Ġsure":1654,"gin":1655,"Ġappear":1656,"Ġlight":1657,"Ġes":1658,"of":1659,"Ġwater":1660,"Ġtimes":1661,"not":1662,"Ġgrow":1663,"Ġcompany":1664,"ĠTe":1665,"ows":1666,"Ġmar":1667,"ource":1668,"iol":1669,"arm":1670,"br":1671,"Ġexample":1672,"Ġconc":1673,"Ġfore":1674,"ĠTo":1675,"pro":1676,"EN":1677,"ries":1678,"Ġ25":1679,"ĠCan":1680,"ney":1681,"Ġactually":1682,"Ġever":1683,"urity":1684,"aken":1685,"aps":1686,"Ġtax":1687,"Ġmajor":1688,"ama":1689,"Ġoften":1690,"eral":1691,"Ġhuman":1692,"Ġjob":1693,"ister":1694,"Ġavailable":1695,"ocr":1696,"enn":1697,"aid":1698,"ivid":1699,"Ġrecord":1700,"?\"":1701,"Ġsing":1702,"ĠAm":1703,"idence":1704,"Ġnews":1705,"ster":1706,"Ġeconom":1707,"Ġfollowing":1708,"ĠBr":1709,"ising":1710,"Ġhour":1711,"most":1712,"ument":1713,"Ġsex":1714,"Ġdesc":1715,"Ġbecome":1716,"ĠEd":1717,"Ġtook":1718,"Ġhaving":1719,"Ġproduct":1720,"ault":1721,"As":1722,"aring":1723,"Ġmeans":1724,"Ġhop":1725,"une":1726,"Ġcho":1727,"Ġcertain":1728,"Ġnon":1729,"Ġdeal":1730,"24":1731,"lement":1732,"oci":1733,"ene":1734,"Ġside":1735,"ĠPr":1736,"ĠMay":1737,"Ġreason":1738,"ued":1739,"ched":1740,"ulation":1741,"Ġelect":1742,"Ġofficial":1743,"Ġpossible":1744,"Ġhold":1745,"ands":1746,"ots":1747,"Ġcity":1748,"ories":1749,"Ġsever":1750,"Ġchildren":1751,"Ġonce":1752,"Ġactiv":1753,"ler":1754,"Ġnight":1755,"itions":1756,"ĠJohn":1757,"ape":1758,"play":1759,"Ġdone":1760,"Ġlim":1761,"Ġworking":1762,"ĠPres":1763,"orld":1764,"eb":1765,"ĠCo":1766,"Ġbody":1767,"ails":1768,"utes":1769,"ĠMr":1770,"Ġwhether":1771,"Ġauthor":1772,"rop":1773,"Ġproper":1774,"Ġseen":1775,");":1776,"Ġfac":1777,"ĠSu":1778,"Ġcond":1779,"iting":1780,"Ġcourse":1781,"Ġ}":1782,"----------------":1783,"aign":1784,"Ġevent":1785,"Ġeng":1786,"Ġpot":1787,"Ġintern":1788,"iam":1789,"Ġshort":1790,"empt":1791,"ãĤ":1792,"ĠGod":1793,"ilar":1794,"80":1795,"Ġorig":1796,"IS":1797,"ourn":1798,"ability":1799,"itive":1800,"Ġdam":1801,"Ġ100":1802,"Ġpress":1803,"Ġdoing":1804,"Ġprotect":1805,"ring":1806,"Ġthought":1807,"Ġquestion":1808,"rew":1809,"ĠWar":1810,"Ġseveral":1811,"ĠState":1812,"Ġgiven":1813,"Ġfund":1814,"ĠTw":1815,"Ġwent":1816,"ances":1817,"work":1818,"por":1
819,"my":1820,"40":1821,"Ġarg":1822,"artment":1823,"ustom":1824,"Ġpolic":1825,"Ġmeet":1826,"Ġcreat":1827,"22":1828,"ĠStates":1829,"Ġgames":1830,"raw":1831,"uture":1832,"Ġunderstand":1833,"urs":1834,"ĠOb":1835,"lish":1836,"sy":1837,"Ġmakes":1838,"Ġwon":1839,"agon":1840,"Ġhtt":1841,"Ġlove":1842,"ential":1843,"Ġcomplete":1844,"par":1845,"ĠIm":1846,"AL":1847,"Ġaccount":1848,"Âł":1849,"ored":1850,"vert":1851,"Ġident":1852,"Ġ2015":1853,"Ġothers":1854,"ĠMin":1855,"iber":1856,"verage":1857,"There":1858,"itional":1859,"dd":1860,"Ġprob":1861,"Ġyoung":1862,"Ġalong":1863,"Ġaccording":1864,"Ġyet":1865,"Ġmembers":1866,"ĠWhat":1867,"oid":1868,"ĠMan":1869,"And":1870,"Ġamong":1871,"ai":1872,"Ġemploy":1873,"ĠRes":1874,"Ġ>":1875,"Ġinvol":1876,"Ġlow":1877,"af":1878,"ĠCar":1879,"Ġhig":1880,"ĠOne":1881,"ĠSec":1882,"ination":1883,"Ġlikely":1884,"Ġant":1885,"aged":1886,"ĠRuss":1887,"Ġben":1888,"Ġrele":1889,"For":1890,"back":1891,"ĠNot":1892,"Ġpresident":1893,"ball":1894,"Ġaccess":1895,"ividual":1896,"ĠDem":1897,"ĠEuro":1898,"60":1899,"Ġknown":1900,"irl":1901,"ĠGr":1902,"Ġearly":1903,"use":1904,"iety":1905,"âĢĵ":1906,"Ġfight":1907,"Ġsent":1908,"Ġtoday":1909,"Ġmarket":1910,"\".":1911,"Ġbased":1912,"Ġstrong":1913,"urther":1914,"Ġdeb":1915,"mber":1916,"Ġproblem":1917,"Ġdeath":1918,"Ġsocial":1919,"imate":1920,"AS":1921,"ortun":1922,"Ġcampaign":1923,"ery":1924,"Ch":1925,"Ġey":1926,"ially":1927,"Ġmus":1928,"wh":1929,"pos":1930,"Ġer":1931,"Ġsaf":1932,"Ġmonths":1933,"iron":1934,"Ġviol":1935,"Ġfive":1936,"Ġstre":1937,"Ġplayers":1938,"inc":1939,"ald":1940,"year":1941,"aun":1942,"Ġsuccess":1943,"Ġpresent":1944,"erence":1945,"Ġ2014":1946,"Ġsugg":1947,"Ġparticular":1948,"Ġtry":1949,"Ġsuggest":1950,"ĠChrist":1951,"ones":1952,"Ġpriv":1953,"23":1954,"Ġcrit":1955,"Ġland":1956,"Ġlocal":1957,"ify":1958,"29":1959,"Ġaut":1960,"ED":1961,"ĠGu":1962,"Ġmult":1963,"Ġpolitical":1964,"Ġasked":1965,"Ġformer":1966,"itter":1967,"ript":1968,"Ġclose":1969,"Ġpract":1970,"ĠYork":1971,"Ġgetting":1972,"Ġacross":1973,"Ġcomb":1974,"Ġbelieve":1975,"Ġz":1976,"Ġtoget":1977,"Ġtogether":1978,"ĠCent":1979,"irc":1980,"Ġindividual":1981,"ĠMc":1982,"27":1983,"isk":1984,"ĠEng":1985,"Ġface":1986,"Ġ24":1987,"Ġvalue":1988,"Ġarea":1989,"ev":1990,"Ġwrit":1991,"ĠPresident":1992,"Ġvot":1993,"Ġkey":1994,"Ġmom":1995,"put":1996,"Ġanything":1997,"Ġexperience":1998,"attle":1999,"Ġmind":2000,"aff":2001,"omm":2002,"Ġfuture":2003,"ged":2004,"Ġcut":2005,"Ġtot":2006,"itch":2007,"Ġvideo":2008,"Ġinvestig":2009,"Ġnet":2010,"ĠMy":2011,"rict":2012,"ien":2013,".)":2014,"Ġimpro":2015,"though":2016,"wards":2017,"Ġconnect":2018,"ĠMed":2019,"selves":2020,"ensive":2021,"mb":2022,"ober":2023,"ators":2024,"An":2025,"Ġ50":2026,"Ġredu":2027,"resent":2028,"Ġabove":2029,"Ġfre":2030,"ĠEurope":2031,"sw":2032,"Ġamount":2033,"ĠApp":2034,"Ġeither":2035,"Ġmilit":2036,"Ġanal":2037,"Ġfail":2038,"ĠEn":2039,"ales":2040,"Ġspecial":2041,"Ġblack":2042,"IT":2043,"cher":2044,"Ġlooking":2045,"Ġfire":2046,"yn":2047,"Ġalmost":2048,"oon":2049,"Ġstudy":2050,"Ġmiss":2051,"ches":2052,"rown":2053,"Ġtre":2054,"Ġcommunity":2055,"Ġmedia":2056,"Ġfood":2057,"Ġcomes":2058,"ĠUniversity":2059,"Ġsingle":2060,"What":2061,"uly":2062,"Ġhalf":2063,"ague":2064,"hod":2065,"ĠRepublic":2066,"Ġstarted":2067,"Ġquick":2068,"oto":2069,"book":2070,"Ġissue":2071,"itor":2072,"Ġelse":2073,"Ġconsider":2074,"26":2075,"rodu":2076,"Ġtaken":2077,"28":2078,"99":2079,"ĠWith":2080,"Ġtrue":2081,"Ġwa":2082,"Ġtrad":2083,"Ġago":2084,"Ġmess":2085,"ief":2086,"Ġadded":2087,"oke":2088,"Ġbad":2089,"Ġfav":2090,"33":2091,"Ġsimilar":2092,"ask":2093,"ĠDon":2094,"Ġ
character":2095,"orts":2096,"ĠHouse":2097,"Ġreported":2098,"Ġtype":2099,"val":2100,"iod":2101,"ĠHowever":2102,"Ġtarg":2103,"Ġentire":2104,"pping":2105,"Ġhistory":2106,"Ġlive":2107,"ffic":2108,"........":2109,"ederal":2110,"Ġtrying":2111,"Ġdiscuss":2112,"ĠHar":2113,"aces":2114,"lished":2115,"Ġself":2116,"osp":2117,"rest":2118,"Ġroom":2119,"elt":2120,"Ġfall":2121,"olution":2122,"Ġet":2123,"Ġx":2124,"Ġisn":2125,"Ġidea":2126,"bo":2127,"Ġsound":2128,"ĠDep":2129,"Ġsomeone":2130,"cially":2131,"ully":2132,"Ġfoc":2133,"Ġobject":2134,"ift":2135,"aper":2136,"Ġplayer":2137,"Ġrather":2138,"Ġservice":2139,"ashing":2140,"ĠDo":2141,"ĠPart":2142,"rug":2143,"mon":2144,"ply":2145,"Ġmor":2146,"Ġnothing":2147,"Ġprovide":2148,"IC":2149,"ung":2150,"Ġparty":2151,"Ġexist":2152,"Ġmag":2153,"70":2154,"Ġrul":2155,"Ġhouse":2156,"Ġbehind":2157,"Ġhowever":2158,"ĠWorld":2159,"Ġsum":2160,"Ġapplic":2161,"Ġ;":2162,"Ġfunction":2163,"gr":2164,"ĠPol":2165,"Ġfront":2166,"200":2167,"Ġseries":2168,"Ġtem":2169,"Ġtyp":2170,"ills":2171,"Ġopt":2172,"Ġpoints":2173,"Ġbelow":2174,"itted":2175,"Ġspecific":2176,"Ġ2017":2177,"umb":2178,"Ġra":2179,"Ġprevious":2180,"Ġpret":2181,"reme":2182,"Ġcustom":2183,"Ġcourt":2184,"ĠMe":2185,"Ġrepl":2186,"Ġwhole":2187,"go":2188,"cer":2189,"Ġtreat":2190,"ĠAct":2191,"Ġprobably":2192,"Ġlearn":2193,"ender":2194,"ĠAss":2195,"Ġversion":2196,"now":2197,"Ġcheck":2198,"ĠCal":2199,"RE":2200,"minist":2201,"On":2202,"ources":2203,"Ġbenef":2204,"Ġdoc":2205,"Ġdeter":2206,"Ġenc":2207,"Ġsuper":2208,"Ġaddress":2209,"Ġvict":2210,"Ġ2013":2211,"Ġmeas":2212,"tr":2213,"Ġfield":2214,"When":2215,"Ġsignific":2216,"uge":2217,"Ġfeat":2218,"Ġcommon":2219,"load":2220,"Ġbegin":2221,"Ġbring":2222,"Ġaction":2223,"erman":2224,"Ġdescrib":2225,"Ġindust":2226,"Ġwanted":2227,"ried":2228,"ming":2229,"Ġattempt":2230,"45":2231,"fer":2232,"Ġdue":2233,"ression":2234,"##":2235,"Ġshall":2236,"Ġsix":2237,"oo":2238,"Ġstep":2239,"Ġpub":2240,"Ġhimself":2241,"Ġ23":2242,"Ġcop":2243,"Ġdest":2244,"Ġstop":2245,"AC":2246,"ibility":2247,"Ġlab":2248,"icult":2249,"Ġhours":2250,"Ġcreate":2251,"Ġfurther":2252,"ĠAmerica":2253,"ĠCity":2254,"Ġdou":2255,"head":2256,"ST":2257,"ĠNorth":2258,"cing":2259,"Ġnational":2260,"ule":2261,"ĠInst":2262,"Ġtaking":2263,"ĠQu":2264,"irt":2265,"Ġred":2266,"Ġresearch":2267,"viron":2268,"ĠGe":2269,"Ġbreak":2270,"ana":2271,"Ġspace":2272,"aterial":2273,"Ġrecent":2274,"ĠAb":2275,"Ġgeneral":2276,"Ġhit":2277,"Ġperiod":2278,"Ġeverything":2279,"ively":2280,"Ġphys":2281,"Ġsaying":2282,"anks":2283,"Ġcou":2284,"Ġcult":2285,"aced":2286,"eal":2287,"uation":2288,"Ġcoun":2289,"lu":2290,"Ġinclude":2291,"Ġposition":2292,"ĠAfter":2293,"ĠCanad":2294,"ĠEm":2295,"Ġimm":2296,"ĠRed":2297,"Ġpick":2298,"Ġcompl":2299,"Ġmatter":2300,"reg":2301,"ext":2302,"angu":2303,"isc":2304,"ole":2305,"aut":2306,"Ġcompet":2307,"eed":2308,"fect":2309,"Ġ21":2310,"ĠSen":2311,"ĠThese":2312,"asing":2313,"Ġcannot":2314,"Ġinit":2315,"Ġrelations":2316,"ached":2317,"Ġbar":2318,"Ġ40":2319,"ĠTH":2320,"Ġ2012":2321,"Ġvol":2322,"Ġground":2323,"Ġsecurity":2324,"Ġupd":2325,"ilt":2326,"35":2327,"Ġconcern":2328,"ĠJust":2329,"Ġwhite":2330,"Ġseems":2331,"ĠHer":2332,"pecially":2333,"ients":2334,"Ġannoun":2335,"Ġfig":2336,"ights":2337,"Ġstri":2338,"like":2339,"ids":2340,"Ġsus":2341,"Ġwatch":2342,"Ġâ":2343,"Ġwind":2344,"ĠCont":2345,"Ġitself":2346,"Ġmass":2347,"Al":2348,"yle":2349,"ique":2350,"ĠNational":2351,"Ġabs":2352,"Ġpack":2353,"Ġoutside":2354,"Ġanim":2355,"Ġpain":2356,"eter":2357,"Ġmanag":2358,"duct":2359,"ogn":2360,"Ġ]":2361,"ĠSept":2362,"sec":2363,"off":2364,"ĠJan":2365,"Ġfoot":2366,"ades"
:2367,"Ġthird":2368,"Ġmot":2369,"Ġevidence":2370,"inton":2371,"Ġthreat":2372,"apt":2373,"ples":2374,"cle":2375,"Ġlo":2376,"Ġdecl":2377,"Ġitem":2378,"medi":2379,"Ġrepresent":2380,"omb":2381,"amer":2382,"Ġsignificant":2383,"ograph":2384,"su":2385,"Ġcal":2386,"ires":2387,"0000":2388,"ID":2389,"AM":2390,"Ġsimply":2391,"Ġlonger":2392,"Ġfile":2393,"OT":2394,"che":2395,"So":2396,"ateg":2397,"org":2398,"ĠHis":2399,"Ġener":2400,"Ġdom":2401,"Ġupon":2402,"ili":2403,"\":\"":2404,"Ġthemselves":2405,"Ġcoming":2406,"Ġquite":2407,"Ġdifficult":2408,"ĠBar":2409,"ilities":2410,"rel":2411,"ends":2412,"cial":2413,"64":2414,"Ġwoman":2415,"rap":2416,"yr":2417,"Ġnecess":2418,"ips":2419,"Ġtext":2420,"Ġrequire":2421,"Ġmilitary":2422,"Ġreview":2423,"Ġrespons":2424,"75":2425,"Ġsubject":2426,"Ġinstead":2427,"Ġissues":2428,"Ġgen":2429,"\",\"":2430,"Ġminutes":2431,"Ġweap":2432,"ray":2433,"amed":2434,"time":2435,"bl":2436,"How":2437,"Ġcode":2438,"ĠSm":2439,"Ġhigher":2440,"ĠSte":2441,"ris":2442,"Ġpage":2443,"Ġstudents":2444,"ĠIntern":2445,"Ġmethod":2446,"ĠAug":2447,"ĠPer":2448,"ĠAg":2449,"Ġpolicy":2450,"ĠSw":2451,"Ġexec":2452,"Ġaccept":2453,"ume":2454,"ribut":2455,"Ġwords":2456,"Ġfinal":2457,"Ġchanges":2458,"ĠDemocr":2459,"Ġfriends":2460,"Ġrespect":2461,"Ġep":2462,"Ġcompan":2463,"ivil":2464,"Ġdamage":2465,"****":2466,"ogle":2467,"vironment":2468,"Ġneg":2469,"ental":2470,"Ġap":2471,"Ġtotal":2472,"ival":2473,"!\"":2474,"lim":2475,"Ġneeds":2476,"Ġagre":2477,"Ġdevelopment":2478,"Ġage":2479,"iple":2480,"21":2481,"Ġresults":2482,"ĠAf":2483,"Sh":2484,"Ġgun":2485,"ĠObama":2486,"roll":2487,"Ġ@":2488,"Ġrights":2489,"ĠBrit":2490,"Ġrunning":2491,"Ġwasn":2492,"Ġport":2493,"Ġrate":2494,"Ġpretty":2495,"Ġtarget":2496,"Ġsaw":2497,"Ġcirc":2498,"Ġworks":2499,"icro":2500,"alt":2501,"over":2502,"www":2503,"That":2504,"lier":2505,"Ġeveryone":2506,"ude":2507,"Ġpie":2508,"iddle":2509,"rael":2510,"Ġrad":2511,"Ġblock":2512,"Ġwalk":2513,"To":2514,"ãģ":2515,"nes":2516,"ĠAust":2517,"aul":2518,"rote":2519,"ĠSouth":2520,"ession":2521,"oph":2522,"Ġshows":2523,"Ġsite":2524,"Ġjo":2525,"Ġrisk":2526,"clus":2527,"lt":2528,"Ġinj":2529,"iding":2530,"ĠSpe":2531,"Ġchall":2532,"irm":2533,"Ġ22":2534,"itting":2535,"str":2536,"Ġhy":2537,"LE":2538,"key":2539,"Ġbegan":2540,"atur":2541,"ashington":2542,"lam":2543,"ĠDav":2544,"bit":2545,"Ġsize":2546,"ĠPar":2547,"38":2548,"ournal":2549,"face":2550,"Ġdecision":2551,"Ġlarg":2552,"Ġjud":2553,"rect":2554,"Ġcontinue":2555,"ĠOct":2556,"overed":2557,"ĠInt":2558,"========":2559,"Ġparent":2560,"ĠWill":2561,"Ġeasy":2562,"Ġdrug":2563,"anger":2564,"Ġsense":2565,"Ġdi":2566,"iday":2567,"Ġenergy":2568,"istic":2569,"Ġassoci":2570,"arter":2571,"obal":2572,"eks":2573,"ĠEl":2574,"urch":2575,"Ġgirl":2576,"oe":2577,"itle":2578,"Ġ28":2579,"ĠChe":2580,"Ġrequest":2581,"Ġsoon":2582,"Ġhost":2583,"ky":2584,"Ġstates":2585,"omes":2586,"Ġmaterial":2587,"lex":2588,"Ġmoment":2589,"Ġansw":2590,"onse":2591,"Ġespecially":2592,"Ġnorm":2593,"Ġservices":2594,"pite":2595,"ran":2596,"Ġrole":2597,"44":2598,"):":2599,"Ġcred":2600,"Cl":2601,"________":2602,"Ġmat":2603,"Ġlog":2604,"ĠClinton":2605,"OU":2606,"Ġoffice":2607,"Ġ26":2608,"Ġcharg":2609,"Ġtrack":2610,"ma":2611,"Ġheart":2612,"Ġball":2613,"Ġpersonal":2614,"Ġbuilding":2615,"na":2616,"set":2617,"body":2618,"ĠBlack":2619,"Ġincrease":2620,"itten":2621,"Ġneeded":2622,"36":2623,"32":2624,"=\"":2625,"Ġlost":2626,"Ġbecame":2627,"Ġgroups":2628,"ĠMus":2629,"Ġwrote":2630,"ĠPe":2631,"Ġprop":2632,"joy":2633,"é":2634,"ĠWhite":2635,"Ġdead":2636,".'":2637,"Ġhttp":2638,"Ġwebs":2639,"OS":2640,"Ġinside":2641,"Ġwrong":2642,"Ġst
atement":2643,"Ġ...":2644,"yl":2645,"Ġfilm":2646,"Ġmusic":2647,"Ġshare":2648,"ification":2649,"Ġrelease":2650,"Ġforward":2651,"Ġstay":2652,"Ġcomput":2653,"itte":2654,"ser":2655,"Ġoriginal":2656,"Ġcard":2657,"Ġcand":2658,"Ġdiv":2659,"atural":2660,"Ġfavor":2661,"OM":2662,"Ġcases":2663,"uses":2664,"Ġsection":2665,"Ġleave":2666,"ging":2667,"oved":2668,"ĠWashington":2669,"39":2670,"ĠGl":2671,"Ġrequired":2672,"action":2673,"apan":2674,"oor":2675,"iter":2676,"ĠKing":2677,"Ġcountries":2678,"ĠGerman":2679,"lling":2680,"Ġ27":2681,"34":2682,"Ġquestions":2683,"Ġprim":2684,"Ġcell":2685,"Ġshoot":2686,"Ġanyone":2687,"ĠWest":2688,"Ġaffect":2689,"epend":2690,"Ġonline":2691,"ĠIsrael":2692,"ĠSeptember":2693,"Ġability":2694,"Ġcontent":2695,"ises":2696,"Ġreve":2697,"Ġlaun":2698,"Ġindic":2699,"Ġforce":2700,"cast":2701,"Ġsold":2702,"aving":2703,"fl":2704,"Ġsoft":2705,"Ġcompanies":2706,"ceed":2707,"Ġarticle":2708,"Ġaud":2709,"Ġrev":2710,"Ġeduc":2711,"Ġplaying":2712,"05":2713,"Ġheld":2714,"ctor":2715,"Ġreleased":2716,"Ġfederal":2717,"37":2718,"Ġadminist":2719,"Ġinterview":2720,"Ġinstall":2721,"Ġreceived":2722,"Ġsource":2723,"uk":2724,"Ph":2725,"Ġserious":2726,"Ġcreated":2727,"Ġcause":2728,"Ġimmedi":2729,"Ġdefin":2730,"uel":2731,"ĠDepartment":2732,"ctions":2733,"ĠCour":2734,"ĠNow":2735,"ze":2736,"ites":2737,"itution":2738,"Ġlate":2739,"Ġspeak":2740,"ners":2741,"Ġlegal":2742,"ari":2743,"ĠCor":2744,"Ġweeks":2745,"Ġmodel":2746,"Ġpred":2747,"Ġexact":2748,"BC":2749,"ĠBy":2750,"ING":2751,"osing":2752,"Ġtakes":2753,"Ġregard":2754,"Ġopportun":2755,"Ġprice":2756,"Ġ198":2757,"ĠApr":2758,"fully":2759,"Ġord":2760,"Ġproblems":2761,"ruction":2762,"ham":2763,"ĠCount":2764,"lege":2765,"Ġleaders":2766,"ET":2767,"lev":2768,"Ġdeep":2769,"ological":2770,"ese":2771,"haps":2772,"ĠSome":2773,"Ġpers":2774,"Ġcontract":2775,"Ġrelationship":2776,"sp":2777,"oud":2778,"Ġbase":2779,"48":2780,"mit":2781,"Ad":2782,"ancial":2783,"Ġconsum":2784,"Ġpotential":2785,"Ġlangu":2786,"rem":2787,"eth":2788,"Ġrelig":2789,"ressed":2790,"66":2791,"Ġlink":2792,"Ġlower":2793,"ayer":2794,"ĠJune":2795,"Ġfem":2796,"unt":2797,"erc":2798,"urd":2799,"Ġcontact":2800,"Ġill":2801,"Ġmother":2802,"Ġestab":2803,"htt":2804,"ĠMarch":2805,"ĠBro":2806,"ĠChina":2807,"Ġ29":2808,"Ġsqu":2809,"Ġprovided":2810,"Ġaverage":2811,"asons":2812,"Ġ2011":2813,"Ġexam":2814,"lin":2815,"55":2816,"ned":2817,"Ġperfect":2818,"Ġtou":2819,"alse":2820,"ux":2821,"Ġbuy":2822,"Ġshot":2823,"Ġcollect":2824,"Ġphot":2825,"Ġplayed":2826,"Ġsurpr":2827,"Ġofficials":2828,"Ġsimple":2829,"avy":2830,"Ġindustry":2831,"Ġhands":2832,"ground":2833,"Ġpull":2834,"Ġround":2835,"Ġuser":2836,"Ġrange":2837,"uary":2838,"Ġprivate":2839,"ops":2840,"ees":2841,"Ġways":2842,"ĠMich":2843,"Ġveh":2844,"Ġexcept":2845,"Ġterms":2846,"imum":2847,"pper":2848,"ION":2849,"ores":2850,"ĠDragon":2851,"oul":2852,"Ġden":2853,"Ġperformance":2854,"Ġbill":2855,"cil":2856,"47":2857,"Ġenvironment":2858,"Ġexc":2859,"add":2860,"Ġworth":2861,"Ġpict":2862,"Ġchance":2863,"Ġ2018":2864,"bor":2865,"Ġspeed":2866,"iction":2867,"Ġalleg":2868,"ĠJapan":2869,"atory":2870,"reet":2871,"Ġmatch":2872,"ĠII":2873,"Ġstru":2874,"order":2875,"Ġste":2876,"Ġliving":2877,"Ġstruct":2878,"ino":2879,"Ġsepar":2880,"hern":2881,"Ġresponse":2882,"Ġenjoy":2883,"Ġvia":2884,"AD":2885,"uments":2886,"acebook":2887,"Ġmember":2888,"ibr":2889,"izing":2890,"Ġtool":2891,"ĠMon":2892,"ĠWhile":2893,"hood":2894,"ĠAng":2895,"ĠDef":2896,"Ġoffer":2897,"Tr":2898,"aur":2899,"Ġturned":2900,"ĠJuly":2901,"down":2902,"anced":2903,"Ġrecently":2904,"ĠEar":2905,"Ġce":2906,"ĠStar":2907,"ĠCong":2908,"rought":2
909,"Ġblood":2910,"Ġhope":2911,"Ġcomment":2912,"aint":2913,"Ġarri":2914,"iles":2915,"Ġparticip":2916,"ought":2917,"ription":2918,"08":2919,"49":2920,"Ġgave":2921,"Ġselect":2922,"Ġkilled":2923,"sych":2924,"Ġgoes":2925,"ij":2926,"Ġcoll":2927,"Ġimpact":2928,"atives":2929,"ĠSer":2930,"09":2931,"ĠAugust":2932,"Ġboy":2933,"de":2934,"ĠDes":2935,"Ġfelt":2936,"US":2937,"Ġexpected":2938,"Ġimage":2939,"ĠMark":2940,"ccording":2941,"oice":2942,"EC":2943,"ĠMag":2944,"ened":2945,"hold":2946,"ĠPost":2947,"Ġprevent":2948,"No":2949,"Ġinvolved":2950,"Ġeyes":2951,"Ġquickly":2952,"At":2953,"unk":2954,"Ġbehav":2955,"Ġur":2956,"Ġled":2957,"come":2958,"ey":2959,"Ġcandid":2960,"Ġearlier":2961,"Ġfocus":2962,"ety":2963,"Pro":2964,"ledge":2965,"ixed":2966,"illed":2967,"Ġpopular":2968,"AP":2969,"Ġsett":2970,"light":2971,"Ġvarious":2972,"inks":2973,"Ġlevels":2974,"Ġroad":2975,"ellig":2976,"ables":2977,"hel":2978,"ittee":2979,"ĠGener":2980,"ype":2981,"Ġheard":2982,"icles":2983,"Ġmis":2984,"Ġusers":2985,"ĠSan":2986,"Ġimprove":2987,"Ġfather":2988,"Ġsearch":2989,"They":2990,"vil":2991,"Ġprofess":2992,"Ġknew":2993,"Ġloss":2994,"Ġevents":2995,"65":2996,"Ġbillion":2997,"07":2998,"02":2999,"ĠNews":3000,"ĠAM":3001,"Ġcover":3002,"where":3003,"ension":3004,"Ġbott":3005,"Ġareas":3006,"ences":3007,"ope":3008,"ĠTwitter":3009,"ael":3010,"Ġgets":3011,"ĠGoogle":3012,"Ġsn":3013,"iant":3014,"Ġvote":3015,"Ġnearly":3016,"Ġincluded":3017,"Ġrecogn":3018,"zz":3019,"mm":3020,"aled":3021,"Ġhappened":3022,"04":3023,"Ġhot":3024,"Ġwhose":3025,"Ġcivil":3026,"Ġsuff":3027,"oes":3028,"itiz":3029,"ĠSyri":3030,"Ġrespond":3031,"Ġhon":3032,"Ġfeatures":3033,"Ġeconomic":3034,"ĠApril":3035,"rim":3036,"Ġtechnology":3037,"Ġoption":3038,"aging":3039,"Ġpurch":3040,"Re":3041,"Ġlat":3042,"chie":3043,"isl":3044,"Ġrecomm":3045,"uf":3046,"Ġtraining":3047,"Ġeffects":3048,"Ġfast":3049,"Ġ2010":3050,"Ġoccur":3051,"Ġwebsite":3052,"Ġemail":3053,"Ġsens":3054,"ech":3055,"Ġoil":3056,"Ġinflu":3057,"Ġcurrently":3058,"ĠSch":3059,"ĠAdd":3060,"Ġgoal":3061,"Ġscient":3062,"Ġconv":3063,"100":3064,"emy":3065,"Ġdecided":3066,"Ġtravel":3067,"Ġmention":3068,"LL":3069,"03":3070,"Ġelection":3071,"Ġphone":3072,"Ġlooks":3073,"Ġsituation":3074,"Ġcy":3075,"Ġhor":3076,"bed":3077,"ĠCourt":3078,"aily":3079,"aves":3080,"Ġquality":3081,"ĠComp":3082,"wise":3083,"Ġtable":3084,"Ġstaff":3085,"ĠWind":3086,"ett":3087,"Ġtried":3088,"idered":3089,"Ġaddition":3090,"Ġbox":3091,"Ġlack":3092,"arily":3093,"Ġwide":3094,"Ġmid":3095,"Ġboard":3096,"ysis":3097,"Ġanti":3098,"ha":3099,"Ġdig":3100,"ening":3101,"Ġdro":3102,"Con":3103,"68":3104,"Ġslow":3105,"based":3106,"sequ":3107,"Ġpath":3108,"Ex":3109,"aker":3110,"Ġworked":3111,"Ġpen":3112,"Ġengine":3113,"Ġlooked":3114,"ĠSuper":3115,"ĠServ":3116,"Ġvictim":3117,"Un":3118,"Ġproperty":3119,"Ġintrodu":3120,"Ġexecut":3121,"ĠPM":3122,"Le":3123,"Ġcolor":3124,"ĠMore":3125,"Ġ60":3126,"Ġnetwork":3127,"Ġdate":3128,"cul":3129,"idge":3130,"Ġextra":3131,"31":3132,"Ġsle":3133,"67":3134,"Ġwond":3135,"Ġreports":3136,"just":3137,"ĠAustral":3138,"Ġcapital":3139,"Ġens":3140,"Ġcommand":3141,"Ġallowed":3142,"Ġprep":3143,"Ġcapt":3144,"hib":3145,"Ġnumbers":3146,"chan":3147,"Ġfair":3148,"mp":3149,"oms":3150,"Ġreach":3151,"With":3152,"tain":3153,"Ġbroad":3154,"Ġcouple":3155,"ecause":3156,"lying":3157,"ĠFeb":3158,"Ġscreen":3159,"Ġlives":3160,"Ġprior":3161,"ĠCongress":3162,"Ar":3163,"Ġapproach":3164,"Ġemer":3165,"aries":3166,"ĠDis":3167,"serv":3168,"ĠNe":3169,"Ġbuilt":3170,"cies":3171,"Ġrepe":3172,"Ġrules":3173,"force":3174,"ĠPal":3175,"Ġfinancial":3176,"Ġconsidered":3177,"ĠChar":3178,"nces":3179
,"ĠIS":3180,"Ġbrought":3181,"Ġbi":3182,"iers":3183,"ĠSim":3184,"OP":3185,"Ġproducts":3186,"Ġvisit":3187,"Ġdocument":3188,"Ġconduct":3189,"Ġcompletely":3190,"ining":3191,"ĠCalif":3192,"ibly":3193,"Ġwritten":3194,"ĠTV":3195,"ements":3196,"Ġdraw":3197,"One":3198,"Ġpublished":3199,"Ġsecret":3200,"rain":3201,"het":3202,"ĠFacebook":3203,"onday":3204,"ĠUp":3205,"Ġsexual":3206,"Ġthous":3207,"ĠPat":3208,"Ġess":3209,"Ġstandard":3210,"Ġarm":3211,"ges":3212,"ection":3213,"Ġfell":3214,"Ġforeign":3215,"ani":3216,"ĠFriday":3217,"Ġregular":3218,"inary":3219,"Ġincreased":3220,"Ġusually":3221,"Ġdemon":3222,"Ġdark":3223,"Ġadditional":3224,"rol":3225,"ĠOf":3226,"Ġproduction":3227,"!!":3228,"undred":3229,"Ġinternational":3230,"idents":3231,"ĠFree":3232,"roup":3233,"Ġrace":3234,"Ġmach":3235,"Ġhuge":3236,"All":3237,"lear":3238,"ovember":3239,"Ġtown":3240,"Ġattention":3241,"ĠOff":3242,"yond":3243,"ĠThen":3244,"field":3245,"Ġterror":3246,"raz":3247,"ĠBo":3248,"Ġmeeting":3249,"ĠPark":3250,"Ġarrest":3251,"Ġfear":3252,"Ġaw":3253,"ĠVal":3254,"oring":3255,"',":3256,"Ġextreme":3257,"arr":3258,"Ġworkers":3259,"After":3260,"Ġ31":3261,"net":3262,"ament":3263,"Ġdirectly":3264,"Ġpopulation":3265,"ube":3266,"ĠOctober":3267,"ĠIN":3268,"ĠJanuary":3269,"59":3270,"ĠDavid":3271,"Ġcross":3272,"cember":3273,"ĠFirst":3274,"Ġmessage":3275,"irit":3276,"Ġnation":3277,"Ġpoll":3278,"isions":3279,"Ġanswer":3280,"ny":3281,"isode":3282,"Ġcarry":3283,"ĠRussia":3284,"Ġhear":3285,"ength":3286,"roy":3287,"Ġnatural":3288,"inally":3289,"Ġdog":3290,"mitted":3291,"Ġtrade":3292,"Ġsubst":3293,"Ġmultiple":3294,"ĠAfric":3295,"Ġfans":3296,"Ġsort":3297,"Ġglobal":3298,"ication":3299,"ĠWed":3300,"ara":3301,"Ġachie":3302,"Ġlanguage":3303,"vey":3304,"Ġtal":3305,"Ġnecessary":3306,"Ġdetails":3307,"Ġsen":3308,"ĠSund":3309,"ĠReg":3310,"ĠRec":3311,"06":3312,"Ġsil":3313,"ressive":3314,"Ġmedical":3315,"unch":3316,"ornia":3317,"Ġund":3318,"fort":3319,"ocks":3320,"ĠMonday":3321,"uesday":3322,"craft":3323,"77":3324,"urt":3325,"Ġver":3326,"ĠHill":3327,"Ġreceive":3328,"Ġmorning":3329,"estern":3330,"Ġbank":3331,"Ġsat":3332,"irth":3333,"ĠHigh":3334,"Ġdevice":3335,"ĠTHE":3336,"ĠCenter":3337,"Ġsafe":3338,"Ġple":3339,"ĠCanada":3340,"Ġsystems":3341,"Ġassist":3342,"Ġsurv":3343,"Ġbattle":3344,"ĠSoc":3345,"vertis":3346,"She":3347,"Ġpaper":3348,"Ġgrowth":3349,"Ġcast":3350,"Sc":3351,"Ġplans":3352,"lled":3353,"Ġparts":3354,"Ġwall":3355,"Ġmovement":3356,"Ġpractice":3357,"imately":3358,"Ġdisplay":3359,"Ġsometimes":3360,"omp":3361,"ĠPaul":3362,"ĠYes":3363,"king":3364,"58":3365,"oly":3366,"Ġson":3367,"Ġavoid":3368,"okes":3369,"ĠJew":3370,"Ġtowards":3371,"asc":3372,"Ġ//":3373,"ĠKore":3374,"Ġtalking":3375,"Ġcorrect":3376,"Ġspent":3377,"icks":3378,"iable":3379,"eared":3380,"Ġterm":3381,"Ġwants":3382,"oming":3383,"Ġut":3384,"Ġdoub":3385,"Ġforces":3386,"Ġplease":3387,"69":3388,"ĠNovember":3389,"atform":3390,"ondon":3391,"Ġones":3392,"Ġimmediately":3393,"ĠRussian":3394,"ĠMet":3395,"Ġdeg":3396,"Ġparents":3397,"CH":3398,"ĠAmericans":3399,"aly":3400,"ĠMod":3401,"Ġshown":3402,"Ġconditions":3403,"Ġstuff":3404,"Ġreb":3405,"ĠYour":3406,"Ġincludes":3407,"nown":3408,"ĠSam":3409,"Ġexperien":3410,"mission":3411,"ĠEven":3412,"aught":3413,"Ġannounced":3414,"ĠRepublican":3415,"Ġdetermin":3416,"Ġdescribed":3417,"ĠCounty":3418,"()":3419,"Ġdoor":3420,"Ġchanged":3421,"Ġneigh":3422,"ĠHere":3423,"Ġclean":3424,"Ġpan":3425,"ĠDecember":3426,"ĠEuropean":3427,"iring":3428,"apter":3429,"Ġclub":3430,"ĠTuesday":3431,"Ġpaid":3432,"ĠNet":3433,"Ġattacks":3434,"Ġcharacters":3435,"Ġalone":3436,"Ġdirector":3437,"dom":3438,"Ġ35
":3439,"Ġload":3440,"Ġrout":3441,"ĠCalifornia":3442,"Ġfinally":3443,"Ġrac":3444,"Ġcontr":3445,"Ġexactly":3446,"resh":3447,"pri":3448,"ĠIslam":3449,"Ġnature":3450,"Ġcareer":3451,"Ġlatest":3452,"Ġconvers":3453,"ĠSl":3454,"pose":3455,"cient":3456,"ĠInc":3457,"ivity":3458,"88":3459,"ĠAtt":3460,"ĠMor":3461,"nesday":3462,"Ġweight":3463,"ken":3464,"Ġnote":3465,"Ġteams":3466,"Ġ\\":3467,"airs":3468,"ĠGreen":3469,"Ġhundred":3470,"onent":3471,"Ġstreng":3472,"Ġconsist":3473,"icated":3474,"Ġregul":3475,"Ġlic":3476,"astic":3477,"Ġten":3478,"ursday":3479,"elligence":3480,"ously":3481,"ĠUK":3482,"BI":3483,"Ġcosts":3484,"Ġindepend":3485,"ĠAP":3486,"Ġnormal":3487,"Ġhom":3488,"Ġobvious":3489,"Ġswe":3490,"Ġstar":3491,"Ġready":3492,"acher":3493,"Ġimplement":3494,"gest":3495,"Ġsong":3496,"ĠGet":3497,"ĠLab":3498,"Ġinteresting":3499,"using":3500,"Ġgiving":3501,"ĠSunday":3502,"Ġetc":3503,"Ġmiddle":3504,"Ġremember":3505,"right":3506,"osition":3507,"utions":3508,"Ġmax":3509,"46":3510,"Ġyourself":3511,"Ġdemand":3512,"Ġtreatment":3513,"Ġdanger":3514,"ĠCons":3515,"Ġguy":3516,"ĠBritish":3517,"Ġphysical":3518,"Ġrelated":3519,"Ġremain":3520,"Ġcouldn":3521,"Ġrefer":3522,"Ġcitiz":3523,"box":3524,"ENT":3525,"board":3526,"Ġinn":3527,"IG":3528,"ero":3529,"ĠStreet":3530,"ospital":3531,"rench":3532,"chers":3533,"Ġstra":3534,"OL":3535,"ager":3536,"ĠAN":3537,"Ġeasily":3538,"IA":3539,"enge":3540,"iny":3541,"Ġclos":3542,"ocked":3543,"Ġuses":3544,"ĠCoun":3545,"Im":3546,"uild":3547,"??":3548,"more":3549,"Ġang":3550,"Ġwrite":3551,"olute":3552,"57":3553,"Ġleader":3554,"Ġreading":3555,"":3784,"Ġfigure":3785,"Ġdisapp":3786,"enty":3787,"Ġsoftware":3788,"Ġult":3789,"Ġofficers":3790,"New":3791,"Is":3792,"Ġremains":3793,"ĠIndia":3794,"Ġpsych":3795,"rief":3796,"Ġcat":3797,"esc":3798,"Ġobserv":3799,"Ġstage":3800,"ĠDark":3801,"Ġenter":3802,"change":3803,"Ġpassed":3804,"Ġdespite":3805,"ĠOut":3806,"Ġmovie":3807,"rs":3808,"Ġvoice":3809,"mine":3810,"ĠPlay":3811,"Ġtoward":3812,"ĠTer":3813,"Ġregion":3814,"Ġvalues":3815,"orters":3816,"Ġmount":3817,"Ġofficer":3818,"ĠOther":3819,"ban":3820,"Ġhous":3821,"wood":3822,"room":3823,"IV":3824,"ĠSun":3825,"see":3826,"ĠOver":3827,"rog":3828,"90":3829,"Ġlay":3830,"ĠTur":3831,"awn":3832,"Ġpressure":3833,"ĠSub":3834,"Ġbooks":3835,"edom":3836,"ĠSand":3837,"AA":3838,"ago":3839,"Ġreasons":3840,"ford":3841,"Ġactivity":3842,"UT":3843,"Now":3844,"ĠSenate":3845,"cell":3846,"night":3847,"Ġcalls":3848,"inter":3849,"Ġletter":3850,"ĠRob":3851,"ĠJe":3852,"Ġchoose":3853,"ĠLaw":3854,"Get":3855,"Be":3856,"Ġrob":3857,"Ġtypes":3858,"Ġplatform":3859,"Ġquarter":3860,"RA":3861,"ĠTime":3862,"Ġmaybe":3863,"ĠCr":3864,"95":3865,"pre":3866,"Ġmoving":3867,"Ġlif":3868,"Ġgold":3869,"Ġsom":3870,"Ġpatients":3871,"Ġtruth":3872,"ĠKe":3873,"urance":3874,"antly":3875,"mar":3876,"Ġcharge":3877,"ĠGreat":3878,"Ġcele":3879,"--------------------------------":3880,"Ġrock":3881,"roid":3882,"ancy":3883,"Ġcredit":3884,"aud":3885,"By":3886,"ĠEvery":3887,"Ġmoved":3888,"inger":3889,"ribution":3890,"Ġnames":3891,"Ġstraight":3892,"ĠHealth":3893,"ĠWell":3894,"Ġfeature":3895,"Ġrule":3896,"Ġsche":3897,"inated":3898,"ĠMichael":3899,"berg":3900,"41":3901,"iled":3902,"band":3903,"Ġclick":3904,"ĠAngel":3905,"onents":3906,"ÂŃ":3907,"ĠIraq":3908,"ĠSaturday":3909,"Ġaware":3910,"part":3911,"Ġpattern":3912,"OW":3913,"ĠLet":3914,"Ġgrad":3915,"igned":3916,"Ġassociated":3917,"Ġstyle":3918,"no":3919,"iation":3920,"aith":3921,"ilies":3922,"Ġstories":3923,"uration":3924,"Ġindividuals":3925,"ĠâĢ¦":3926,"miss":3927,"ĠAssoci":3928,"ishing":3929,"aby":3930,"Ġsummer":3931,"ĠBen":3932,"Ġ
32":3933,"Ġarch":3934,"uty":3935,"ĠTexas":3936,"hol":3937,"Ġfully":3938,"Ġmill":3939,"Ġfollowed":3940,"ĠBill":3941,"ĠIndian":3942,"ĠSecret":3943,"ĠBel":3944,"ĠFebruary":3945,"Ġjobs":3946,"Ġseemed":3947,"ĠGovern":3948,"ipped":3949,"Ġreality":3950,"Ġlines":3951,"Ġpark":3952,"Ġmeasure":3953,"ĠOur":3954,"IM":3955,"Ġbrother":3956,"Ġgrowing":3957,"Ġban":3958,"Ġestim":3959,"Ġcry":3960,"ĠSchool":3961,"Ġmechan":3962,"ĠOF":3963,"ĠWindows":3964,"Ġrates":3965,"ĠOh":3966,"Ġpositive":3967,"Ġculture":3968,"istics":3969,"ica":3970,"Ġhar":3971,"ya":3972,"itely":3973,"ipp":3974,"Ġmap":3975,"encies":3976,"ĠWilliam":3977,"II":3978,"akers":3979,"56":3980,"ĠMart":3981,"ĠRem":3982,"Ġaltern":3983,"itude":3984,"Ġcoach":3985,"rowd":3986,"Don":3987,"Ġkids":3988,"Ġjournal":3989,"Ġcorpor":3990,"Ġfalse":3991,"Ġweb":3992,"Ġsleep":3993,"Ġcontain":3994,"Ġsto":3995,"Ġbed":3996,"iverse":3997,"ĠRich":3998,"ĠChinese":3999,"Ġpun":4000,"Ġmeant":4001,"known":4002,"Ġnotice":4003,"Ġfavorite":4004,"aven":4005,"Ġcondition":4006,"Ġpurpose":4007,"))":4008,"Ġorganization":4009,"Ġchalleng":4010,"Ġmanufact":4011,"Ġsusp":4012,"ĠAc":4013,"Ġcritic":4014,"unes":4015,"uclear":4016,"Ġmer":4017,"vention":4018,"Ġ80":4019,"Ġmist":4020,"ĠUs":4021,"ĠTor":4022,"http":4023,"olf":4024,"Ġlarger":4025,"Ġadvant":4026,"Ġresear":4027,"Ġactions":4028,"ml":4029,"Ġkept":4030,"Ġaim":4031,",'":4032,"col":4033,"Ġbenefits":4034,"ifying":4035,"Ġactual":4036,"ĠInternational":4037,"Ġvehicle":4038,"Ġchief":4039,"Ġefforts":4040,"ĠLeague":4041,"ĠMost":4042,"Ġwait":4043,"Ġadult":4044,"Ġoverall":4045,"Ġspeech":4046,"Ġhighly":4047,"Ġfemale":4048,"Ġerror":4049,"Ġeffective":4050,"54":4051,"Ġencour":4052,"well":4053,"Ġfailed":4054,"Ġconserv":4055,"Ġprograms":4056,"Ġtrou":4057,"Ġahead":4058,"500":4059,"vertisement":4060,"IP":4061,"ĠFound":4062,"pir":4063,"Ġ%":4064,"Ġcrime":4065,"ander":4066,"Ġlocation":4067,"ĠIran":4068,"Ġbehavior":4069,"azing":4070,"Ġrare":4071,"Ġemb":4072,"Ġcaused":4073,"Ġship":4074,"Ġactive":4075,"Ġcontribut":4076,"Ġgreen":4077,"Ġacqu":4078,"Ġreflect":4079,"venue":4080,"Ġfirm":4081,"Ġbirth":4082,"].":4083,"Ġclearly":4084,"Ġemot":4085,"Ġagency":4086,"riage":4087,"Ġmemory":4088,"98":4089,"SA":4090,"ĠSee":4091,"acing":4092,"CC":4093,"Ġbiggest":4094,"Ġrap":4095,"Ġbasic":4096,"Ġband":4097,"eat":4098,"Ġsuspect":4099,"ĠMac":4100,"Ġ90":4101,"mark":4102,"istan":4103,"Ġspread":4104,"ams":4105,"ki":4106,"asy":4107,"rav":4108,"ĠRober":4109,"Ġdemonstr":4110,"rated":4111,"Ġabsolute":4112,"Ġplaces":4113,"Ġimpl":4114,"ibrary":4115,"Ġcards":4116,"Ġdestroy":4117,"Ġvirt":4118,"vere":4119,"Ġappeared":4120,"yan":4121,"point":4122,"Ġbeg":4123,"Ġtemper":4124,"spe":4125,"anted":4126,"ears":4127,"ĠDirect":4128,"Ġlength":4129,"Ġblog":4130,"amb":4131,"Ġinteg":4132,"Ġresources":4133,"acc":4134,"iful":4135,"Ġspot":4136,"Ġforced":4137,"Ġthousands":4138,"ĠMinister":4139,"Ġqual":4140,"ĠFrench":4141,"atically":4142,"Ġgenerally":4143,"Ġdrink":4144,"Ġthus":4145,"IL":4146,"odes":4147,"Ġappropri":4148,"ĠRead":4149,"Ġwhom":4150,"Ġeye":4151,"Ġcollege":4152,"Ġ45":4153,"irection":4154,"Ġensure":4155,"Ġapparent":4156,"iders":4157,"Ġreligious":4158,"Ġminor":4159,"olic":4160,"Ġtro":4161,"ĠWhy":4162,"ribute":4163,"met":4164,"Ġprimary":4165,"Ġdeveloped":4166,"Ġpeace":4167,"Ġskin":4168,"ste":4169,"ava":4170,"Ġblue":4171,"Ġfamilies":4172,"Ġir":4173,"Ġapply":4174,"Ġinform":4175,"ĠSmith":4176,"CT":4177,"ii":4178,"Ġlimit":4179,"Ġresist":4180,"................":4181,"umn":4182,"Ġconflic":4183,"Ġtwe":4184,"udd":4185,"ĠTom":4186,"Ġliter":4187,"que":4188,"bon":4189,"Ġhair":4190,"Ġeventually":4191,"Ġpus":4192,"Ġ
helped":4193,"Ġagg":4194,"orney":4195,"ĠApple":4196,"Ġfit":4197,"ĠSur":4198,"Ġprem":4199,"Ġsales":4200,"Ġseconds":4201,"Ġstrength":4202,"Ġfeeling":4203,"¿½":4204,"Ġtour":4205,"Ġknows":4206,"oom":4207,"Ġexerc":4208,"Ġsomew":4209,"�":4210,">>":4211,"Ġspokes":4212,"Ġideas":4213,"Ġregist":4214,"soft":4215,"ĠDel":4216,"ĠPC":4217,"Ġpropos":4218,"Ġlaunch":4219,"Ġbottom":4220,"TH":4221,"ĠPlease":4222,"vest":4223,"itz":4224,"ĠInter":4225,"Ġscript":4226,"Ġrat":4227,"arning":4228,"Ġil":4229,"ĠJer":4230,"ĠAre":4231,"Ġwhatever":4232,"oken":4233,"cience":4234,"Ġmode":4235,"Ġagree":4236,"Ġsources":4237,"Ġinitial":4238,"Ġrestrict":4239,"Ġwonder":4240,"usion":4241,"####":4242,"ĠSil":4243,"ville":4244,"Ġburn":4245,"tw":4246,"asion":4247,"Ġ£":4248,"Ġnor":4249,"uing":4250,"Ġreached":4251,"Ġsun":4252,"Ġcateg":4253,"igration":4254,"Ġcook":4255,"Ġpromot":4256,"Ġmale":4257,"Ġclimate":4258,"Ġfix":4259,"Ġalleged":4260,"UR":4261,"alled":4262,"Ġimages":4263,"Cont":4264,"ota":4265,"Ġschools":4266,"ios":4267,"Ġdrop":4268,"Ġstream":4269,"ĠMo":4270,"Ġpreviously":4271,"aling":4272,"Ġpet":4273,"Ġdouble":4274,"Ġ(@":4275,"annel":4276,"Ġdefault":4277,"ties":4278,"Ġrank":4279,"ĠDec":4280,"ĠCouncil":4281,"Ġweapon":4282,"Ġstock":4283,"Ġanaly":4284,"ĠStr":4285,"Ġpicture":4286,"ĠPolice":4287,"ference":4288,"Ġcentury":4289,"Ġcitizens":4290,"Ġonto":4291,"Ġexpand":4292,"Ġhero":4293,"ĠSol":4294,"Ġwild":4295,"Ġupdate":4296,"Ġcustomers":4297,"ront":4298,"def":4299,"Ġlik":4300,"Ġcriminal":4301,"ĠChristian":4302,"SP":4303,"76":4304,"Ġleaving":4305,"Ġotherwise":4306,"ĠDist":4307,"Ġbasis":4308,"52":4309,"53":4310,"icip":4311,"ĠBer":4312,"Ġrecommend":4313,"Ġfloor":4314,"Ġcrowd":4315,"oles":4316,"Ġ70":4317,"Ġcentral":4318,"ĠEv":4319,"Ġdream":4320,"Ġdownload":4321,"Ġconfir":4322,"ĠThom":4323,"Ġwindow":4324,"Ġhappens":4325,"Ġunit":4326,"Ġtend":4327,"Ġspl":4328,"Ġbecomes":4329,"Ġfighting":4330,"Ġpredict":4331,"ĠPress":4332,"ĠPower":4333,"Ġheavy":4334,"aked":4335,"Ġfan":4336,"orter":4337,"ategy":4338,"BA":4339,"izes":4340,"Ġspend":4341,"Here":4342,"Ġ2007":4343,"Ġadop":4344,"ĠHam":4345,"Ġfootball":4346,"ĠPort":4347,"oday":4348,"51":4349,"ampions":4350,"Ġtransfer":4351,"ht":4352,"Ġ38":4353,"term":4354,"acity":4355,"Ġbur":4356,"],":4357,"ternal":4358,"rig":4359,"but":4360,"Ġtherefore":4361,"ĠBecause":4362,"resp":4363,"rey":4364,"Ġmission":4365,"Some":4366,"Ġnoted":4367,"Ġassum":4368,"Ġdisease":4369,"Ġedit":4370,"Ġprogress":4371,"rd":4372,"ĠBrown":4373,"ocal":4374,"Ġadding":4375,"Ġraised":4376,"ĠAny":4377,"Ġtick":4378,"Ġseeing":4379,"ĠPeople":4380,"Ġagreement":4381,"Ġserver":4382,"Ġwat":4383,"Ġdebate":4384,"Ġsupposed":4385,"iling":4386,"Ġlargest":4387,"Ġsuccessful":4388,"ĠPri":4389,"ĠDemocratic":4390,"Ġjump":4391,"ĠSyria":4392,"Ġowners":4393,"Ġoffers":4394,"Ġshooting":4395,"Ġeffic":4396,"sey":4397,"Ġhaven":4398,"verse":4399,"tered":4400,"ĠLight":4401,"imal":4402,"ĠBig":4403,"Ġdefend":4404,"Ġbeat":4405,"Ġrecords":4406,"%)":4407,"Ġscen":4408,"Ġemployees":4409,"Ġdevices":4410,"hem":4411,"Ġcommer":4412,"ĠMex":4413,"Ġbenefit":4414,"ĠProf":4415,"Ġilleg":4416,"Ġsurface":4417,"ĠAlso":4418,"Ġharm":4419,"ingly":4420,"wide":4421,"ĠAlex":4422,"Ġshut":4423,"ĠCur":4424,"Ġlose":4425,"pm":4426,"Ġchallenge":4427,"semb":4428,"Ġstation":4429,"Ġintelligence":4430,"Ġaccur":4431,"ĠFlor":4432,"Ġrequires":4433,"ĠMal":4434,"bum":4435,"Ġhospital":4436,"Ġspirit":4437,"Ġoffered":4438,"Ġproduce":4439,"ĠCommun":4440,"Ġcreating":4441,"Ġcris":4442,"spect":4443,"Ġended":4444,"Ġdaily":4445,"Ġvoters":4446,"lands":4447,"ias":4448,"ih":4449,"ona":4450,"Ġsmart":4451,"ĠOffice":4452,"ĠLord"
:4453,"rial":4454,"ĠInternet":4455,"Ġcircum":4456,"Ġextremely":4457,"'.":4458,"Ġopinion":4459,"ĠMil":4460,"Ġgain":4461,"BS":4462,"ĠFin":4463,"yp":4464,"Ġuseful":4465,"Ġbudget":4466,"Ġcomfort":4467,"isf":4468,"Ġbackground":4469,"eline":4470,"Ġepisode":4471,"Ġenemy":4472,"Ġtrial":4473,"Ġestablish":4474,"date":4475,"ĠCap":4476,"Ġcontinues":4477,"Ġshowing":4478,"ĠUnion":4479,"with":4480,"Ġposted":4481,"ĠSystem":4482,"Ġeat":4483,"rian":4484,"Ġrise":4485,"ĠGermany":4486,"ils":4487,"Ġsigned":4488,"Ġvill":4489,"Ġgrand":4490,"mor":4491,"ĠEngland":4492,"Ġprojects":4493,"umber":4494,"Ġconference":4495,"za":4496,"Ġresponsible":4497,"ĠArab":4498,"Ġlearned":4499,"âĢĶâĢĶ":4500,"ipping":4501,"ĠGeorge":4502,"OC":4503,"Ġreturned":4504,"ĠAustralia":4505,"Ġbrief":4506,"Qu":4507,"Ġbrand":4508,"illing":4509,"abled":4510,"Ġhighest":4511,"Ġtrain":4512,"ĠCommission":4513,"while":4514,"Ġnom":4515,"ception":4516,"Ġmut":4517,"ĠBlue":4518,"Ġincident":4519,"vant":4520,"86":4521,"ĠID":4522,"Ġnuclear":4523,"74":4524,"ĠLike":4525,"ĠRE":4526,"ĠMicro":4527,"li":4528,"mail":4529,"Ġcharges":4530,"89":4531,"Ġadjust":4532,"ado":4533,"Ġearth":4534,"NA":4535,"Ġprices":4536,"PA":4537,"Ġdraft":4538,"Ġruns":4539,"Ġcandidate":4540,"enses":4541,"Ġmanagement":4542,"ĠPhil":4543,"ĠMiss":4544,"Ġteach":4545,"gram":4546,"Ġunderstanding":4547,"ait":4548,"icago":4549,"Add":4550,"ĠEp":4551,"secut":4552,"Ġseparate":4553,"Ġinstance":4554,"Ġeth":4555,"Ġunless":4556,"********":4557,"ĠFore":4558,"inate":4559,"Ġoperations":4560,"Sp":4561,"Ġfaith":4562,"gar":4563,"ĠChurch":4564,"ronic":4565,"Ġconfig":4566,"osure":4567,"Ġactivities":4568,"Ġtraditional":4569,"Ġ36":4570,"Ġdirection":4571,"Ġmachine":4572,"Ġsurround":4573,"Ġpush":4574,"unction":4575,"ĠEU":4576,"Ġeasier":4577,"Ġargument":4578,"GB":4579,"Ġmicro":4580,"Ġspending":4581,"izations":4582,"Ġtheory":4583,"adow":4584,"Ġcalling":4585,"ĠLast":4586,"Ġder":4587,"Ġinfluence":4588,"Ġcommit":4589,"Ġphoto":4590,"Ġunc":4591,"istry":4592,"gn":4593,"aste":4594,"acks":4595,"Ġdisp":4596,"ady":4597,"do":4598,"ĠGood":4599,"Ġ`":4600,"Ġwish":4601,"Ġrevealed":4602,"³³":4603,"lig":4604,"Ġenforce":4605,"ĠCommittee":4606,"Ġchem":4607,"Ġmiles":4608,"Ġinterested":4609,"Ġsolution":4610,"icy":4611,"inct":4612,"Ġ->":4613,"ĠDet":4614,"Ġremoved":4615,"Ġcompar":4616,"eah":4617,"Ġplant":4618,"ĠSince":4619,"Ġachieve":4620,"Ġadvantage":4621,"Ġslightly":4622,"bing":4623,"Ġplaced":4624,"under":4625,"2015":4626,"ĠMad":4627,"Ġtim":4628,"oses":4629,"Ġcru":4630,"ĠRock":4631,"Ġmostly":4632,"Ġnegative":4633,"Ġsetting":4634,"Ġproduced":4635,"Ġmur":4636,"Ġconnection":4637,"ĠMer":4638,"Ġdriver":4639,"Ġexecutive":4640,"Ġassault":4641,"Ġborn":4642,"ĠVer":4643,"tained":4644,"Ġstructure":4645,"Ġreduce":4646,"Ġdecades":4647,"Ġded":4648,"uke":4649,"ĠMany":4650,"idden":4651,"Ġleague":4652,"Se":4653,"Ġjoin":4654,"Ġdisco":4655,"Ġdie":4656,"cks":4657,"actions":4658,"Ġassess":4659,"agn":4660,"Ġgoals":4661,"ours":4662,"IR":4663,"Ġsenior":4664,"iller":4665,"mod":4666,"ipment":4667,"ocol":4668,"uy":4669,"ĠQue":4670,"Ġparties":4671,"irgin":4672,"Ġlearning":4673,"itable":4674,"Ġstreet":4675,"Ġcamera":4676,"App":4677,"Ġskills":4678,"bre":4679,"cious":4680,"Ġcelebr":4681,"ĠFranc":4682,"Ġexisting":4683,"Ġwilling":4684,"lor":4685,"Ġid":4686,"ĠSpace":4687,"Ġcritical":4688,"ĠLa":4689,"ortunately":4690,"Ġserve":4691,"Ġcold":4692,"Ġspecies":4693,"TS":4694,"Ġanimals":4695,"ĠBay":4696,"Ġolder":4697,"ĠUnder":4698,"estic":4699,"ĠTre":4700,"Ġteacher":4701,"Ġprefer":4702,"vis":4703,"Ġthread":4704,"ĠMatt":4705,"Ġmanager":4706,"ãĥ»":4707,"Ġprofessional":4708,"ĠVol":4709,"Ġ
notes":4710,"These":4711,"ula":4712,"Ġfresh":4713,"ented":4714,"uzz":4715,"edy":4716,"clusion":4717,"ĠRel":4718,"Ġdoubt":4719,"EO":4720,"Ġopened":4721,"ĠBit":4722,"Advertisement":4723,"Ġguess":4724,"ĠUN":4725,"Ġsequ":4726,"Ġexplain":4727,"otten":4728,"Ġattract":4729,"aks":4730,"Ġstring":4731,"Ġcontext":4732,"ossible":4733,"ĠRepublicans":4734,"Ġsolid":4735,"Ġcities":4736,"Ġasking":4737,"Ġrandom":4738,"ups":4739,"uries":4740,"arant":4741,"dden":4742,"gl":4743,"ĠFlorida":4744,"Ġdepend":4745,"ĠScott":4746,"Ġ33":4747,"ĠiT":4748,"icon":4749,"Ġmentioned":4750,"Ġ2000":4751,"Ġclaimed":4752,"Ġdefinitely":4753,"ulf":4754,"Ġcore":4755,"Ġopening":4756,"ĠConst":4757,"which":4758,"ĠTra":4759,"AG":4760,"72":4761,"Ġbelieved":4762,"ada":4763,"Ġ48":4764,"ĠSecurity":4765,"yright":4766,"ĠPet":4767,"ĠLou":4768,"Ġholding":4769,"================":4770,"Ġice":4771,"Ġbrow":4772,"Ġauthorities":4773,"host":4774,"word":4775,"Ġscore":4776,"ĠDiv":4777,"Ġcells":4778,"Ġtransl":4779,"Ġneighbor":4780,"Ġremove":4781,"uct":4782,"Ġdistrict":4783,"ĠAccording":4784,"Ġworse":4785,"Ġconcerns":4786,"Ġpresidential":4787,"Ġpolicies":4788,"ĠHall":4789,"73":4790,"Ġhus":4791,"AY":4792,"Ġ2006":4793,"ĠJud":4794,"Ġindependent":4795,"ĠJustice":4796,"iliar":4797,"print":4798,"ighter":4799,"Ġprotection":4800,"zen":4801,"Ġsudden":4802,"house":4803,"ĠJes":4804,"PR":4805,"ĠInf":4806,"Ġbul":4807,"Ġ_":4808,"ĠService":4809,"ĠPR":4810,"Ġstrategy":4811,"ffect":4812,"Ġgirls":4813,"Ġmissing":4814,"oyal":4815,"ĠTeam":4816,"ulated":4817,"Ġdat":4818,"Ġpolitics":4819,"abor":4820,"According":4821,"Ġspell":4822,"Ġgraph":4823,"orthern":4824,"TC":4825,"Ab":4826,"Ġlabor":4827,"isher":4828,"Ġkick":4829,"ĠiTunes":4830,"Ġsteps":4831,"poses":4832,"Ġsmaller":4833,"En":4834,"bert":4835,"Ġroll":4836,"Ġresearchers":4837,"Ġclosed":4838,"Ġtransport":4839,"Ġlawy":4840,"________________":4841,"ĠChicago":4842,"Ġaspect":4843,"Ġnone":4844,"Ġmarriage":4845,"96":4846,"Ġelements":4847,"ĠFre":4848,"ĠSal":4849,"Ġdram":4850,"FC":4851,"top":4852,"equ":4853,"Ġhearing":4854,"Ġsupported":4855,"Ġtesting":4856,"cohol":4857,"Ġmassive":4858,"Ġstick":4859,"Ġguard":4860,"isco":4861,"phone":4862,"From":4863,"However":4864,"Ġborder":4865,"Ġcopy":4866,"ography":4867,"list":4868,"71":4869,"Ġowner":4870,"class":4871,"ruit":4872,"rate":4873,"ĠOnce":4874,"Ġdigital":4875,"Ġtask":4876,"ERS":4877,"Ġincred":4878,"tes":4879,"++":4880,"ĠFrance":4881,"Ġbreat":4882,"owl":4883,"Ġissued":4884,"ĠWestern":4885,"Ġdetect":4886,"Ġpartners":4887,"Ġshared":4888,"ĠCall":4889,"Ġcancer":4890,"ache":4891,"ribe":4892,"Ġexplained":4893,"Ġheat":4894,"{\"":4895,"Ġinvestment":4896,"ĠBook":4897,"Ġwood":4898,"Ġtools":4899,"ĠAlthough":4900,"Ġbelief":4901,"Ġcrisis":4902,"Ġge":4903,"ĠMP":4904,"Ġoperation":4905,"type":4906,"~~":4907,"ga":4908,"Ġcontains":4909,"anta":4910,"Ġexpress":4911,"ĠGroup":4912,"ĠJournal":4913,"ka":4914,"Ġamb":4915,"ĠUSA":4916,"Ġfinding":4917,"Ġfunding":4918,"how":4919,"Ġestablished":4920,"ideos":4921,"Ġdegree":4922,"Ġdangerous":4923,"anging":4924,"Ġfreedom":4925,"pport":4926,"outhern":4927,"Ġchurch":4928,"Ġcatch":4929,"ĠTwo":4930,"Ġpresence":4931,"ĠGuard":4932,"Up":4933,"Ġauthority":4934,"ĠProject":4935,"Ġbutton":4936,"Ġconsequ":4937,"Ġvalid":4938,"Ġweak":4939,"Ġstarts":4940,"Ġreference":4941,"ĠMem":4942,"\")":4943,"UN":4944,"orage":4945,"ĠOpen":4946,"Ġcollection":4947,"ym":4948,"gency":4949,"Ġbeautiful":4950,"ros":4951,"Ġtells":4952,"Ġwaiting":4953,"nel":4954,"Ġproviding":4955,"ĠDemocrats":4956,"Ġdaughter":4957,"Ġmaster":4958,"Ġpurposes":4959,"ĠJapanese":4960,"Ġequal":4961,"Ġturns":4962,"Ġdocuments":4963,"
Ġwatching":4964,"Res":4965,"Ġran":4966,"2014":4967,"Ġreject":4968,"ĠKorea":4969,"Ġvictims":4970,"Level":4971,"erences":4972,"Ġwitness":4973,"Ġ34":4974,"Ġreform":4975,"coming":4976,"Ġoccup":4977,"Ġcaught":4978,"Ġtraffic":4979,"ading":4980,"Ġmodels":4981,"ario":4982,"Ġserved":4983,"Ġbatter":4984,"uate":4985,"ĠSecretary":4986,"Ġagreed":4987,"Ġtruly":4988,"ynam":4989,"ĠRet":4990,"Ġunits":4991,"ĠResearch":4992,"hand":4993,"azine":4994,"ĠMike":4995,"Ġvariety":4996,"otal":4997,"Ġamazing":4998,"<|endoftext|>":4999} \ No newline at end of file diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index f89cd773c..959c9343f 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -6,6 +6,26 @@ # from megatron.data.gpt_dataset import build_train_valid_test_datasets from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets +from datasets import load_dataset + +import torch.distributed as dist + +# To preprocess data before testing +# python tools/preprocess_data.py \ +# --input tests/data/t0/ag_news_classify_question_first.json \ +# --output-prefix tests/data/t0/ag_news_prompt \ +# --dataset-impl mmap \ +# --tokenizer-type PretrainedFromHF \ +# --tokenizer-name-or-path gpt2 \ +# --append-eod \ +# --workers 8 + +os.environ['MASTER_ADDR'] = 'localhost' +os.environ['MASTER_PORT'] = '12355' + +# initialize the process group +dist.init_process_group("nccl", rank=0, world_size=1) + #Initialize Megatron with dummy variables initialize_megatron( extra_args_provider=None, @@ -16,6 +36,9 @@ "num_attention_heads": 4, "seq_length": 256, "max_position_embeddings": 256, + "distributed_backend": "nccl", + "tokenizer_type": "PretrainedFromHF", + "tokenizer_name_or_path": "gpt2", } ) From 6b9e81a3eeaf49647ff88439784265820bcf2c59 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 13:37:42 +0700 Subject: [PATCH 154/297] changed with impossible token --- megatron/data/non_causal_mtf_dataset.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index cd46b6b58..b7ed7a1c7 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -234,19 +234,18 @@ def __init__( indexed_dataset, num_samples, seq_length, - seed + seed, + impossible_token=-100, ): # Params to store. self.name = name self.seq_length = seq_length + self.impossible_token = impossible_token # Dataset. 
self.indexed_dataset = indexed_dataset - # vocab - self.tokenizer = get_tokenizer() - # Checks assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset.sizes.shape[0] @@ -268,7 +267,7 @@ def __getitem__(self, idx): self.doc_idx[idx] ) - eod_idx = np.where(sample == self.tokenizer.eod)[0] + eod_idx = np.where(sample == self.impossible_token)[0] input_tokens = sample[:eod_idx] target_tokens = sample[eod_idx:] From 7feec27fc1705a26f31897bdc51a9ed25e2de2b5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 18:53:28 +0700 Subject: [PATCH 155/297] enable loading multiple indexed_dataset for each field --- megatron/data/non_causal_mtf_dataset.py | 67 +++++++++++-------------- tests/test_packing_dataloader.py | 30 ++++++++--- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index b7ed7a1c7..029ff1fd5 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -172,11 +172,11 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + indexed_dataset = {} + for field in data_prefix: + indexed_dataset[field] = get_indexed_dataset_(data_prefix[field], data_impl, skip_warmup) - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = indexed_dataset[field].sizes.shape[0] # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. @@ -251,25 +251,20 @@ def __init__( assert np.max(documents) < indexed_dataset.sizes.shape[0] # Build index mappings. - self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix, documents, self.indexed_dataset.sizes, + self.doc_idx, self.shuffle_idx = _build_index_mappings( + self.name, data_prefix, documents, self.indexed_dataset['input_tokens'].sizes, num_samples, seq_length, seed) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.sample_idx.shape[0] - 1 + return self.doc_idx.shape[0] - 1 def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - sample = self.indexed_dataset.get( - self.doc_idx[idx] - ) - - eod_idx = np.where(sample == self.impossible_token)[0] - input_tokens = sample[:eod_idx] - target_tokens = sample[eod_idx:] + input_tokens = self.indexed_dataset['input_tokens'].get(self.doc_idx[idx]) + target_tokens = self.indexed_dataset['target_tokens'].get(self.doc_idx[idx]) return { 'input_tokens': np.array(input_tokens, dtype=np.int64), @@ -281,9 +276,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed, cutoff_last_epoch=0.95): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. - sample-idx: is the start document index and document offset for each - training sample. - shuffle-idx: maps the sample index into a random index into sample-idx. + shuffle-idx: maps the an index into a random index into sample-idx. """ # Number of tokens in each epoch and number of required epochs. 
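# ----------------------------------------------------------------------------
# Illustrative aside, not part of this commit: with this change the dataset is
# backed by one indexed dataset per field, so `data_prefix` becomes a mapping
# from field name to file prefix rather than a single path. A sketch of the
# intended shape, using the fixture prefixes from the test suite below:
#
#     data_prefix = {
#         "input_tokens": "tests/data/t0/ag_news_prompt_inputs_document",
#         "target_tokens": "tests/data/t0/ag_news_prompt_targets_document",
#     }
#     indexed_dataset = {
#         field: get_indexed_dataset_(path, "mmap", True)  # prefix, data_impl, skip_warmup
#         for field, path in data_prefix.items()
#     }
# ----------------------------------------------------------------------------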
tokens_per_epoch = _num_tokens(documents, sizes) @@ -353,29 +346,29 @@ def _build_index_mappings(name, data_prefix, documents, sizes, np.save(doc_idx_filename, doc_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save doc-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) - # sample-idx. - start_time = time.time() - # Use C++ implementation for speed. - # First compile and then import. - from megatron.data import helpers - assert doc_idx.dtype == np.int32 - assert sizes.dtype == np.int32 - sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - num_epochs, tokens_per_epoch) - - np.save(sample_idx_filename, sample_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save sample-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - # shuffle-idx. - start_time = time.time() - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) + # # sample-idx. + # start_time = time.time() + # # Use C++ implementation for speed. + # # First compile and then import. + # from megatron.data import helpers + # assert doc_idx.dtype == np.int32 + # assert sizes.dtype == np.int32 + # sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, + # num_epochs, tokens_per_epoch) + + # np.save(sample_idx_filename, sample_idx, allow_pickle=True) + # print_rank_0(' > elasped time to build and save sample-idx mapping ' + # '(seconds): {:4f}'.format(time.time() - start_time)) + # # shuffle-idx. + # start_time = time.time() + # # -1 is due to data structure used to retieve the index: + # # sample i --> [sample_idx[i], sample_idx[i+1]) if separate_last_epoch: num_samples_ = num_samples_from_epochs_minus_one else: num_samples_ = sample_idx.shape[0] - 1 - shuffle_idx = _build_shuffle_idx(num_samples_, - sample_idx.shape[0] - 1, np_rng) + + shuffle_idx = _build_shuffle_idx(num_samples_, doc_idx.shape[0] - 1, np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) @@ -407,7 +400,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, sample_idx.shape[0])) print_rank_0(' total number of epochs: {}'.format(num_epochs)) - return doc_idx, sample_idx, shuffle_idx + return doc_idx, shuffle_idx def _num_tokens(documents, sizes): diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index 959c9343f..e8cd0f76d 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -10,16 +10,31 @@ import torch.distributed as dist -# To preprocess data before testing +## To preprocess data before testing +# TOKENIZER_PATH="gpt2" +# DATA_PATH="tests/data/t0/ag_news_classify_question_first.json" +# OUTPUT="tests/data/t0/ag_news_prompt" # python tools/preprocess_data.py \ -# --input tests/data/t0/ag_news_classify_question_first.json \ -# --output-prefix tests/data/t0/ag_news_prompt \ +# --input $DATA_PATH \ +# --output-prefix $OUTPUT \ # --dataset-impl mmap \ +# --json-key inputs \ # --tokenizer-type PretrainedFromHF \ -# --tokenizer-name-or-path gpt2 \ +# --tokenizer-name-or-path $TOKENIZER_PATH \ # --append-eod \ # --workers 8 +# python tools/preprocess_data.py \ +# --input $DATA_PATH \ +# --output-prefix $OUTPUT \ +# --dataset-impl mmap \ +# --json-key targets \ +# --tokenizer-type PretrainedFromHF \ +# --tokenizer-name-or-path $TOKENIZER_PATH \ +# --append-eod \ +# --workers 8 + + os.environ['MASTER_ADDR'] = 
'localhost' os.environ['MASTER_PORT'] = '12355' @@ -43,10 +58,13 @@ ) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=["tests/data/t0/ag_news_prompt_text_document"], + data_prefix=[{ + "input_tokens": "tests/data/t0/ag_news_prompt_inputs_document", + "target_tokens": "tests/data/t0/ag_news_prompt_targets_document" + }], data_impl="mmap", splits_string="90,5,5", - train_valid_test_num_samples=[100,100,100], + train_valid_test_num_samples=[100,0,0], seq_length=1024, seed=124, skip_warmup=True From f84f29354aef238f71c01f97b17921444f534bf5 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 18:56:21 +0700 Subject: [PATCH 156/297] minor fix --- megatron/data/non_causal_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 029ff1fd5..f457d0efc 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -248,7 +248,7 @@ def __init__( # Checks assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset.sizes.shape[0] + assert np.max(documents) < indexed_dataset['input_tokens'].sizes.shape[0] # Build index mappings. self.doc_idx, self.shuffle_idx = _build_index_mappings( From 2778d8d89aa8c4a06f831ae6c0cac1ac0bbb21b8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 18:59:43 +0700 Subject: [PATCH 157/297] data_prefix is set as dict --- megatron/data/non_causal_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index f457d0efc..31e0c109c 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -252,7 +252,7 @@ def __init__( # Build index mappings. 
self.doc_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix, documents, self.indexed_dataset['input_tokens'].sizes, + self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, num_samples, seq_length, seed) def __len__(self): From 61ac4b9d03913b6e99ecf050ef2b622934bf95ba Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 19:09:40 +0700 Subject: [PATCH 158/297] removed sample_idx lines --- megatron/data/non_causal_mtf_dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 31e0c109c..3742fdbb9 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -390,14 +390,14 @@ def _build_index_mappings(name, data_prefix, documents, sizes, doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') print_rank_0(' > loading sample-idx mapping from {}'.format( sample_idx_filename)) - sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading shuffle-idx mapping from {}'.format( - shuffle_idx_filename)) + # sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + # print_rank_0(' > loading shuffle-idx mapping from {}'.format( + # shuffle_idx_filename)) shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( time.time() - start_time)) - print_rank_0(' total number of samples: {}'.format( - sample_idx.shape[0])) + # print_rank_0(' total number of samples: {}'.format( + # sample_idx.shape[0])) print_rank_0(' total number of epochs: {}'.format(num_epochs)) return doc_idx, shuffle_idx From 62e3fb13a30d122b76945c93c2ac5dd8aa8dc246 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Fri, 24 Jun 2022 19:12:34 +0700 Subject: [PATCH 159/297] change line from sample_idx to doc_idx --- megatron/data/non_causal_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 3742fdbb9..952c7e62d 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -366,7 +366,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes, if separate_last_epoch: num_samples_ = num_samples_from_epochs_minus_one else: - num_samples_ = sample_idx.shape[0] - 1 + num_samples_ = doc_idx.shape[0] - 1 shuffle_idx = _build_shuffle_idx(num_samples_, doc_idx.shape[0] - 1, np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) From cb79f09e59478f3cbb9ae3ea1cee0c94db177cb8 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 25 Jun 2022 12:49:49 +0700 Subject: [PATCH 160/297] replace shuffling _build_index_mappings with random.sample of the doc_idx --- megatron/data/non_causal_mtf_dataset.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 952c7e62d..f23baac51 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -17,6 +17,7 @@ import os import time +import random import numpy as np import torch @@ -250,15 +251,18 @@ def __init__( assert np.min(documents) >= 0 assert np.max(documents) < indexed_dataset['input_tokens'].sizes.shape[0] - # Build index mappings. 
- self.doc_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, - num_samples, seq_length, seed) + # # Build index mappings. + # self.doc_idx, self.shuffle_idx = _build_index_mappings( + # self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, + # num_samples, seq_length, seed) + self.doc_idx = documents + self.shuffle_idx = random.sample(self.doc_idx, len(self.doc_idx)) def __len__(self): # -1 is due to data structure used to retieve the index: # sample i --> [sample_idx[i], sample_idx[i+1]) - return self.doc_idx.shape[0] - 1 + # return self.doc_idx.shape[0] - 1 + return len(self.doc_idx) def __getitem__(self, idx): # Get the shuffled index. From e9cf22a34b6a508f38f928133a2211e730fc9f91 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Sat, 25 Jun 2022 12:50:07 +0700 Subject: [PATCH 161/297] minor changes --- tests/test_packing_dataloader.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index e8cd0f76d..4be4ff71b 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -2,14 +2,11 @@ import torch.distributed as dist from megatron.initialize import initialize_megatron -# from megatron.data.data_samplers import MegatronPackedRandomSampler -# from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.data.data_samplers import MegatronPackedRandomSampler from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets from datasets import load_dataset -import torch.distributed as dist - ## To preprocess data before testing # TOKENIZER_PATH="gpt2" # DATA_PATH="tests/data/t0/ag_news_classify_question_first.json" @@ -34,13 +31,6 @@ # --append-eod \ # --workers 8 - -os.environ['MASTER_ADDR'] = 'localhost' -os.environ['MASTER_PORT'] = '12355' - -# initialize the process group -dist.init_process_group("nccl", rank=0, world_size=1) - #Initialize Megatron with dummy variables initialize_megatron( extra_args_provider=None, @@ -64,7 +54,7 @@ }], data_impl="mmap", splits_string="90,5,5", - train_valid_test_num_samples=[100,0,0], + train_valid_test_num_samples=[50,0,0], seq_length=1024, seed=124, skip_warmup=True @@ -77,10 +67,10 @@ print(line) -# dl = torch.utils.data.DataLoader( -# train_ds, -# batch_size=4, -# # batch_sampler=batch_sampler, -# num_workers=4, -# pin_memory=True -# ) +dl = torch.utils.data.DataLoader( + train_ds, + batch_size=4, + batch_sampler=batch_sampler, + num_workers=4, + pin_memory=True + ) From acd87cd59c71339aabd4120043a08fc631067fa4 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 27 Jun 2022 13:35:06 +0200 Subject: [PATCH 162/297] Cleanup artefacts --- megatron/data/non_causal_mtf_dataset.py | 8 +-- .../t0/ag_news_prompt_inputs_document.bin | Bin 0 -> 12038 bytes .../t0/ag_news_prompt_inputs_document.idx | Bin 0 -> 2042 bytes tests/test_packing_dataloader.py | 60 ++++++++++-------- 4 files changed, 37 insertions(+), 31 deletions(-) create mode 100644 tests/data/t0/ag_news_prompt_inputs_document.bin create mode 100644 tests/data/t0/ag_news_prompt_inputs_document.idx diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index f23baac51..815faaa09 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -78,7 +78,7 @@ def build_dataset_group(dataset_group_name, paths, 
weights, splits, data_impl, seq_length, seed, skip_warmup, train_valid_test):
     '''
     Build a single dataset group corresponding to Option 2 of data loading see arguments.py
-    a dataset group is passed on the following form
+    a dataset group is passed in the following form
     GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2
     or alternatively
     GIVEN_NAME PATH1    # for a single dataset to be used fully
@@ -99,7 +99,7 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl,
     else:
         data_prefix = []
-        # data_prefix is on the shape:
+        # data_prefix is of the shape:
         # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"]
         for w,p in zip(weights, paths):
             data_prefix += [w,p]
@@ -251,7 +251,7 @@ def __init__(
         # Checks
         assert np.min(documents) >= 0
         assert np.max(documents) < indexed_dataset['input_tokens'].sizes.shape[0]
-        # # Build index mappings.
+        # Build index mappings.
         # self.doc_idx, self.shuffle_idx = _build_index_mappings(
         #     self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes,
         #     num_samples, seq_length, seed)
@@ -280,7 +280,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
     """Build doc-idx, sample-idx, and shuffle-idx.
     doc-idx: is an array (ordered) of documents to be used in training.
-    shuffle-idx: maps the an index into a random index into sample-idx.
+    shuffle-idx: maps an index into a random index into sample-idx.
     """
     # Number of tokens in each epoch and number of required epochs.
     tokens_per_epoch = _num_tokens(documents, sizes)
diff --git a/tests/data/t0/ag_news_prompt_inputs_document.bin b/tests/data/t0/ag_news_prompt_inputs_document.bin
new file mode 100644
index 0000000000000000000000000000000000000000..b786d6e414d6c5784520a20da2f254eb9c3e7add
GIT binary patch
literal 12038
[... base85-encoded binary fixture data (12,038-byte .bin plus 2,042-byte .idx) largely elided ...]
z%k&LU-T~k%{a~dNen1R0U7n=4y8!Qm7;J>|=%l?-c_hfN1->|nI`H8Zx9zLyCiKSM zenB(yvp&I#0Z4L4=Gf<;iSS2LTU0R}1BFe~FKNh|bUCq2z^6=v+ zr}gqInH*7T)u%%D*uIVTz5h6}QJmYhyvJ#=sz(~uwGoD~A^B~ur~twjIiW&+T~d!^ zCmkTLxr0mq`JjfLv(?NSZX{MBIvp*&!29l`!pSOyBK)C0-@Oj#@hFUUH`&u0r;@e;#3?7yZoV}=nrH_DUn`&DMC@*0 zU`%mXk0)9(R@?zsrR8e8krO;rw`T{-K;@XgeIJWYr$eIFgw@YYhDEQ>Vl$=N@h5sL)G&;k#mx}i41o-S=jw#XqPEw zV3$;Y)}FQ)ZH*Q<&qd-anw{hY^pg~!0IOvcoW~utCo3RKy%B(kJw)K)YP+L1e&ld$985b1&e%BS#kzI^$%_1E(ZsPIW68FOvKa?XUaEP=<`x zgl&PLPi`L7!jgi4gLhPU!R2=*{t7a$7yLR{&LUr0Vuhglcmu;vxIZKSYy2$w04JKs z+38TG5A4u#kKuNFe|2=btF2UDZNWED{{DA_=i>gpXLgI=D}9o}P5#*XuBMvKP;1B| zqv)_hM{`uT2d!`}tRr@eRXUdUef!_tb}u?TqwGZs`}{Mk9Z5X0u#Zg6yv}A(dbID* z%=`{YvN1frgPHD^%yfcu3-Rqxi!V>Udw5URoqpoH6Kzw*KGE`WcNRAl`cjjdtacBG zIj4KW9gS-+bmzqLD46#S5IMgr9WHmz!`y#Xi}wM?vySr~cIP7Gm$4g&g?LJdu%2_1 zYGexl)~nw!F)wUwCc4;@2PAD~-0io4hLT5kN6KLxqg0Zq`Y}o~t*Bc>$0+9Ew7#C+ zHK*{$=-a(S@&%XAjm+|GrSCj!c6Xc~YTE3dp=G>wIb?TybFz}~`Zp(=Y0Ejv|PoY zIe4-9Epef^V0Z)c$C>+9?NXe^C>8>o)aJt%9Gi81s|WZQS(Zz2e|a9y6=UsV_ukI5 zb-{$B*1WCPNQo@(Hq3DYQOcr@3;dtx5UgZTRES-ghg*vCkSbxF@!Q^hp2s*VdRtt8 zH(>gz+r*17-_jHBx=1($v|7Kx=XN5OGMswwCMnAmyHw03`SU+hKjKbeA9NA#&H3`E zVddu*EXV9v6xN|MKJ$qdX~-+T*uCzB-`gP$tYtMOe{~PYvF7!es2vkF5eC?C#*Rp@ zDC}azLkOj;7HPGZ6SYh($(s^ZppnT?WH$C&eM>G(FK?KgRsp|hYM_^KOxWLLXUJyH z9bGVdDi`Bw8#G@ka}Dfn)lokcGQrakP0vM6p*x*{eSnBsXh=Ahfmu}!DnLsBe5qvj zJAKPNzk{EV2g}4tZWX+X(e3T@l#rA7M7pA@2=Du|xOj7%L=Fm0m}0%N_rnQx^noJn zNaMPNGPzXttO0LW#ectN5In#=Dwn0X8=zmr76`PNy7-oxh!yQ{=GwOV`Pe7*5+us!r%vV?1{ui{cvK zuyO|4;R@>c&ED7gRl)Y++t~9i#@Mc37)@^&TPlceU5>1M6s~}mrZ8qMs^{~Tueg_U z%=B3N6K$%eC0nQYc`HI`K5BES*9OX-`4`re#E1jvFXs_|ol(T2v`dz$=&OI-$+}@| zuVj_Wx7<*&sdH6F5`5bGjAhT??nqx!uf$t2F6@!n>~|kBNN$ox!kX*)2xWC zakezM>pIb~&I>=`ZQZ+b83Q{Pk<8o8d6xAV6Sy4ge%vHW7#h@i^(ag~)9sT(!MuK1 zV!bE-hS)7cB7DwiI2>CJ=kz^e81ht-y64%ancDdfoLXkcH;jETHn!sJn+#PJ-Z))^ zcjS^>pKgetClTGTnGL&PrfpYE+irBiSLD&L`6ivry>ZrB0ToR1zE)vG!$wqh_SFV#1La)VXTa?rm&ze zR1}8V%2-ekVyG+(F%-ta`R{x$F;4RHotb+%@7#O-JEx|qA?;4$Y# z*E!c2)Z)D0s&_qbop7CZb-GkfyBb|(sMeWsmAXn?M_l_|M_t7()iU>Lta>3e{a=Nv z$aUP6`Q0ub54p5nt)tS_=DO;ta%m2++_l%G-^)Et^-KTea7U{ainosN1 zHO-lcn+)dQKvoRCz{X{fe}g^CBcFwnD>t0JF(6{{nkhPm01KZZYH`ZPdr%cUZA5>M!6NtX?1Wmv8~5Hbi|4&coJ?QGWxM;OM5P|Ay~3M}B`xFts%} z3G=s+!@lj2&%lBmk-vhAuq-F)kKh!{+Zpwza2i@})SKZr%-t3BMffZ)@{!%aKX7bM zA9-qD@GWdCh}}ZDu&-fP!=8p64f`2(Gi+wq%&?hZFT+-boecXJcCr8LVc5X1fwBF@_9s0} zkIS|j+ieBB1Gd-LR%1JpzNcH*#-t}IXWNpVq@3+Zdb4u2Bk9x1dF@GmQm!7QP>)vV zOi+{u>?BmbQ&a}*6s&^Pum+xn>YWPpM1}gHLg$V`=Ypa(`5R}JhIQoiFbx}EBW!}r zum!flHrNh3U?;o;)$0{k0?C=7VHf!|*bRGNFYJT;Q0I=~dcX$Z4VXCxZjs-Hci>$( H1c%{YQkiNZ literal 0 HcmV?d00001 diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index 4be4ff71b..6d8367c3d 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -1,39 +1,35 @@ import os import torch.distributed as dist +import torch + from megatron.initialize import initialize_megatron from megatron.data.data_samplers import MegatronPackedRandomSampler from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets -from datasets import load_dataset +""" +To preprocess data before testing + +TOKENIZER_PATH="gpt2" +DATA_PATH="tests/data/t0/ag_news_classify_question_first.json" +OUTPUT="tests/data/t0/ag_news_prompt" -## To preprocess data before testing -# TOKENIZER_PATH="gpt2" -# DATA_PATH="tests/data/t0/ag_news_classify_question_first.json" -# OUTPUT="tests/data/t0/ag_news_prompt" -# python tools/preprocess_data.py \ -# --input $DATA_PATH \ -# --output-prefix $OUTPUT \ -# --dataset-impl mmap \ -# --json-key inputs 
\ -# --tokenizer-type PretrainedFromHF \ -# --tokenizer-name-or-path $TOKENIZER_PATH \ -# --append-eod \ -# --workers 8 +python tools/preprocess_data.py \ + --input $DATA_PATH \ + --output-prefix $OUTPUT \ + --dataset-impl mmap \ + --json-key inputs \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --append-eod \ + --workers 8 +""" -# python tools/preprocess_data.py \ -# --input $DATA_PATH \ -# --output-prefix $OUTPUT \ -# --dataset-impl mmap \ -# --json-key targets \ -# --tokenizer-type PretrainedFromHF \ -# --tokenizer-name-or-path $TOKENIZER_PATH \ -# --append-eod \ -# --workers 8 -#Initialize Megatron with dummy variables +# Initialize Megatron with dummy variables initialize_megatron( extra_args_provider=None, + allow_no_cuda=True, args_defaults={ "micro_batch_size": 4, "num_layers": 4, @@ -51,14 +47,14 @@ data_prefix=[{ "input_tokens": "tests/data/t0/ag_news_prompt_inputs_document", "target_tokens": "tests/data/t0/ag_news_prompt_targets_document" - }], + }], data_impl="mmap", splits_string="90,5,5", train_valid_test_num_samples=[50,0,0], seq_length=1024, seed=124, skip_warmup=True - ) +) print("Test show dataset") for idx in range(0,4): @@ -67,10 +63,20 @@ print(line) +batch_sampler = MegatronPackedRandomSampler( + sequence_length=256, + dataset=train_ds, + total_samples=len(256), + consumed_samples=0, + micro_batch_size=4, + data_parallel_rank=0, + data_parallel_size=1 +) + dl = torch.utils.data.DataLoader( train_ds, batch_size=4, batch_sampler=batch_sampler, num_workers=4, pin_memory=True - ) + ) From 019ed7c95b1faa9ea3c00df7628f986329d41c6c Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 28 Jun 2022 11:30:33 +0200 Subject: [PATCH 163/297] Add packed preprocessing --- megatron/data/data_samplers.py | 89 +++++++++++++++++++++++-- megatron/data/non_causal_mtf_dataset.py | 8 +-- megatron/utils.py | 54 +++++++++++++++ pretrain_t0.py | 71 ++++++++++++++++++++ tests/test_packing_dataloader.py | 76 +++++++++++++++++---- 5 files changed, 276 insertions(+), 22 deletions(-) create mode 100644 pretrain_t0.py diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 907a82371..8a7bfd457 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -15,13 +15,75 @@ """Dataloaders.""" +from functools import partial +import numpy as np import torch -import random + from megatron import get_args from megatron import mpu +def pack_samples(items, max_seq_len=2049): + """ + Input: + [{'input_tokens': array([ 6, 7, 8, 3]), + 'target_tokens': array([4, 5])}, {'input_tokens'... 
+ + Output: + decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] + decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]] + decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]] + """ + + decoder_target_tokens = [[]] + decoder_segment_ids = [[]] + decoder_causal_attention = [[]] + + batch_num = 0 + item_num = 0 + cur_len = 0 + for token_dict in items: + input_token_len = len(token_dict["input_tokens"]) + target_token_len = len(token_dict["target_tokens"]) + total_len = input_token_len + target_token_len + if cur_len + total_len > max_seq_len: + len_diff = max_seq_len - cur_len + # Padding + if len_diff > 0: + decoder_target_tokens[batch_num].append(np.zeros((len_diff))) + decoder_segment_ids[batch_num].append(np.zeros((len_diff))) + decoder_causal_attention[batch_num].append(np.zeros((len_diff))) + batch_num += 1 + item_num = 0 + cur_len = 0 + decoder_target_tokens.append([]) + decoder_segment_ids.append([]) + decoder_causal_attention.append([]) + + decoder_target_tokens[batch_num].append(token_dict["input_tokens"]) + decoder_target_tokens[batch_num].append(token_dict["target_tokens"]) + cur_len += total_len + + decoder_segment_ids[batch_num].append(np.ones((total_len)) + item_num) + decoder_causal_attention[batch_num].append(np.ones((input_token_len))) + decoder_causal_attention[batch_num].append(np.zeros((target_token_len))) + item_num += 1 + # Padding + len_diff = max_seq_len - cur_len + if len_diff > 0: + decoder_target_tokens[batch_num].append(np.zeros((len_diff))) + decoder_segment_ids[batch_num].append(np.zeros((len_diff))) + decoder_causal_attention[batch_num].append(np.zeros((len_diff))) + + return { + "decoder_target_tokens": np.stack([np.concatenate(arr) for arr in decoder_target_tokens]), + "decoder_segment_ids": np.stack([np.concatenate(arr) for arr in decoder_segment_ids]), + "decoder_causal_attention": np.stack([np.concatenate(arr) for arr in decoder_causal_attention]), + } + + + def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): """Buld dataloader given an input dataset.""" @@ -46,7 +108,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'packed': batch_sampler = MegatronPackedRandomSampler( - sequence_length=args.seq_length, + sequence_length=args.seq_length + 1, dataset=dataset, total_samples=len(dataset), consumed_samples=consumed_samples, @@ -60,10 +122,15 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): if num_workers is None: num_workers = args.num_workers + collate_fn = None + if args.dataloader_type == 'packed': + collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1) + # Torch dataloader. return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=num_workers, + collate_fn=collate_fn, pin_memory=True) class MegatronPretrainingSampler: @@ -171,9 +238,10 @@ def __iter__(self): class MegatronPackedRandomSampler(object): """docstring for MegatronPackedRandomSampler""" - def __init__(self, sequence_length, total_samples, consumed_samples, micro_batch_size, + def __init__(self, sequence_length, dataset, total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size): # Keep a copy of input params for later use. 
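# ----------------------------------------------------------------------------
# Illustrative aside, not part of this commit: a standalone sanity check of
# the packing layout `pack_samples` above is meant to produce. Note the
# docstring's Input line does not quite match its Output rows; inputs that
# are consistent with those rows are two samples of two prompt tokens and one
# target token each. The expected arrays below are derived by hand from the
# function body in this patch, not from any released API.
#
#     import numpy as np
#     from megatron.data.data_samplers import pack_samples
#
#     items = [
#         {"input_tokens": np.array([6, 7]), "target_tokens": np.array([8])},
#         {"input_tokens": np.array([3, 4]), "target_tokens": np.array([5])},
#     ]
#     packed = pack_samples(items, max_seq_len=7)
#     # one packed row: samples laid back to back, zero-padded to max_seq_len
#     assert packed["decoder_target_tokens"].tolist() == [[6, 7, 8, 3, 4, 5, 0]]
#     # segment ids identify which sample owns each position
#     assert packed["decoder_segment_ids"].tolist() == [[1, 1, 1, 2, 2, 2, 0]]
#     # 1 on prompt tokens, 0 on target tokens and padding
#     assert packed["decoder_causal_attention"].tolist() == [[1, 1, 0, 1, 1, 0, 0]]
# ----------------------------------------------------------------------------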
+ self.dataset = dataset self.sequence_length = sequence_length self.total_samples = total_samples self.consumed_samples = consumed_samples @@ -216,10 +284,21 @@ def __iter__(self): idx_range = [start_idx + x for x in random_idx[bucket_offset:]] batch = [] + batch_count = 0 + token_lens = 0 # Last batch if not complete will be dropped. for idx in idx_range: - batch.append(idx) - if len(batch) == self.micro_batch_size: + tok_len = len(self.dataset[idx]['input_tokens']) + len(self.dataset[idx]['target_tokens']) + if token_lens + tok_len > self.sequence_length: + batch_count += 1 + token_lens = 0 + + if batch_count == self.micro_batch_size: self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch + batch_count = 0 batch = [] + else: + token_lens += tok_len + batch.append(idx) + diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/non_causal_mtf_dataset.py index 815faaa09..129cf6816 100644 --- a/megatron/data/non_causal_mtf_dataset.py +++ b/megatron/data/non_causal_mtf_dataset.py @@ -252,11 +252,9 @@ def __init__( assert np.max(documents) < indexed_dataset['input_tokens'].sizes.shape[0] # Build index mappings. - # self.doc_idx, self.shuffle_idx = _build_index_mappings( - # self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, - # num_samples, seq_length, seed) - self.doc_idx = documents - self.shuffle_idx = random.sample(self.doc_idx, len(self.doc_idx)) + self.doc_idx, self.shuffle_idx = _build_index_mappings( + self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, + num_samples, seq_length, seed) def __len__(self): # -1 is due to data structure used to retieve the index: diff --git a/megatron/utils.py b/megatron/utils.py index 98d2f611c..b89861bbe 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -250,6 +250,60 @@ def get_ltor_masks_and_position_ids( return attention_mask, loss_mask, position_ids +def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, segment_ids, datatype=torch.int64): + + inputs_mask = decoder_causal_attention.unsqueeze(-1) * decoder_causal_attention.unsqueeze(1) + inputs_mask = inputs_mask.unsqueeze(1) + + """Causal Inputs Mask: + mask = [[[[1, 1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 1]]]] + """ + causal_inputs_mask = torch.logical_or(causal_mask, inputs_mask).to(datatype) + + """Padding Mask: + mask = [[[[1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + padding_mask = (tokens > 0).unsqueeze(-1) * (tokens > 0).unsqueeze(1) + padding_mask = padding_mask.unsqueeze(1) + + + """Segment Mask: + mask = [[[[1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + segment_mask = (segment_ids.unsqueeze(-1)) == (segment_ids.unsqueeze(1)) + segment_mask = segment_mask.unsqueeze(1) + + """Final Mask: + mask = [[[[1, 1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + attention_mask = causal_inputs_mask * padding_mask * segment_mask + + return attention_mask + def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 
'ds_id') else parameter.nelement() diff --git a/pretrain_t0.py b/pretrain_t0.py new file mode 100644 index 000000000..baac0a9f1 --- /dev/null +++ b/pretrain_t0.py @@ -0,0 +1,71 @@ +"""Pretrain T0""" + +import torch + +from megatron import get_args, get_tokenizer, mpu +from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask + + + +def get_batch_pipe_packed(data): + """ + Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion + + data: + decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] + decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]] + decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]] + """ + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['decoder_target_tokens', 'decoder_segment_ids', 'decoder_causal_attention'] + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + tokens_ = data_b['decoder_target_tokens'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + segment_ids = data_b['decoder_segment_ids'].long()[:, :-1] + decoder_causal_attention = data_b['decoder_causal_attention'].long()[:, :-1] + + # Get the masks and position ids. + causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=None, + loss_on_targets_only=False # This is done below + ) + # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding + loss_mask *= torch.logical_and((decoder_causal_attention - 1) * -1, tokens) + loss_mask = loss_mask.to(datatype) + + attention_mask = get_packed_attention_mask( + causal_mask=causal_mask, + tokens=tokens, + decoder_causal_attention=decoder_causal_attention, + segment_ids=segment_ids, + datatype=datatype, + ) + + if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] + tokens = tokens[:, :args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + labels = labels[:, :args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index 6d8367c3d..287e8c700 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -1,11 +1,19 @@ +from functools import partial +import sys +from pathlib import Path import os -import torch.distributed as dist import torch +# Insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[1]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + from megatron.initialize import initialize_megatron -from megatron.data.data_samplers import MegatronPackedRandomSampler +from megatron.data.data_samplers import MegatronPackedRandomSampler, pack_samples from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets +from megatron.utils import get_packed_attention_mask """ To preprocess data before testing @@ -23,7 +31,31 @@ --tokenizer-name-or-path $TOKENIZER_PATH \ --append-eod \ --workers 8 + +python 
tools/preprocess_data.py \ + --input $DATA_PATH \ + --output-prefix $OUTPUT \ + --dataset-impl mmap \ + --json-key targets \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --append-eod \ + --workers 8 +""" + + +""" +Define Environment variables if necessary """ +os.environ["RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" +os.environ["MASTER_ADDR"] = "jean-zay-pp2" # $(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +os.environ["MASTER_PORT"] = "6002" +os.environ["LOCAL_RANK"] = "0" + + +seq_length = 256 + # Initialize Megatron with dummy variables @@ -35,13 +67,13 @@ "num_layers": 4, "hidden_size": 64, "num_attention_heads": 4, - "seq_length": 256, - "max_position_embeddings": 256, + "seq_length": seq_length, + "max_position_embeddings": seq_length, "distributed_backend": "nccl", "tokenizer_type": "PretrainedFromHF", "tokenizer_name_or_path": "gpt2", - } - ) + } +) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=[{ @@ -51,7 +83,7 @@ data_impl="mmap", splits_string="90,5,5", train_valid_test_num_samples=[50,0,0], - seq_length=1024, + seq_length=seq_length, seed=124, skip_warmup=True ) @@ -64,9 +96,9 @@ batch_sampler = MegatronPackedRandomSampler( - sequence_length=256, + sequence_length=seq_length, dataset=train_ds, - total_samples=len(256), + total_samples=len(train_ds), consumed_samples=0, micro_batch_size=4, data_parallel_rank=0, @@ -75,8 +107,28 @@ dl = torch.utils.data.DataLoader( train_ds, - batch_size=4, batch_sampler=batch_sampler, num_workers=4, - pin_memory=True - ) + pin_memory=True, + collate_fn=partial(pack_samples, max_seq_len=256), +) + +for i, items in enumerate(dl): + + micro_batch_size, seq_length = items['decoder_target_tokens'].shape + causal_mask = torch.tril( + torch.ones( + (micro_batch_size, seq_length, seq_length)) + ).view( + micro_batch_size, 1, seq_length, seq_length + ) + + mask = get_packed_attention_mask( + causal_mask=causal_mask, + tokens=torch.tensor(items['decoder_target_tokens']), + decoder_causal_attention=torch.tensor(items['decoder_causal_attention']), + segment_ids=torch.tensor(items['decoder_segment_ids']), + ) + + assert mask.shape == (micro_batch_size, 1, seq_length, seq_length) + From 7619f7a6bda08faaacf6af0d0a98fc929d838cb2 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 28 Jun 2022 11:54:42 +0200 Subject: [PATCH 164/297] Use seq_length arg --- tests/test_packing_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index 287e8c700..4dcceda9f 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -110,7 +110,7 @@ batch_sampler=batch_sampler, num_workers=4, pin_memory=True, - collate_fn=partial(pack_samples, max_seq_len=256), + collate_fn=partial(pack_samples, max_seq_len=seq_length), ) for i, items in enumerate(dl): From 219209acdf08fe312d69146d798a4d02a8a031d4 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 28 Jun 2022 11:57:39 +0200 Subject: [PATCH 165/297] Add sources & docstrings --- megatron/data/data_samplers.py | 10 ++++++---- megatron/utils.py | 6 ++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 8a7bfd457..af7a983c4 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -26,9 +26,9 @@ def pack_samples(items, max_seq_len=2049): """ - Input: - [{'input_tokens': array([ 6, 7, 8, 3]), - 'target_tokens': array([4, 5])}, 
{'input_tokens'... + Items: + [{'input_tokens': array([ 6, 7, 8, 3]), + 'target_tokens': array([4, 5])}, {'input_tokens'... Output: decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] @@ -237,7 +237,9 @@ def __iter__(self): class MegatronPackedRandomSampler(object): - """docstring for MegatronPackedRandomSampler""" + """ + To be used with pack_samples collate_fn + """ def __init__(self, sequence_length, dataset, total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size): # Keep a copy of input params for later use. diff --git a/megatron/utils.py b/megatron/utils.py index b89861bbe..799196c10 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -251,7 +251,9 @@ def get_ltor_masks_and_position_ids( def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, segment_ids, datatype=torch.int64): - + """ + Inspired by https://github.com/google-research/t5x/blob/7193407f98a8b18100b71a04ff777238be1682ca/t5x/examples/decoder_only/layers.py#L978 + """ inputs_mask = decoder_causal_attention.unsqueeze(-1) * decoder_causal_attention.unsqueeze(1) inputs_mask = inputs_mask.unsqueeze(1) @@ -303,7 +305,7 @@ def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, seg attention_mask = causal_inputs_mask * padding_mask * segment_mask return attention_mask - + def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement() From 67424d6dd67312db12f3ab53c3946c92d7aacf1d Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 29 Jun 2022 11:58:06 +0700 Subject: [PATCH 166/297] added training process for t0 --- examples/finetune_t0.sh | 44 +++++++++++++ pretrain_t0.py | 142 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 examples/finetune_t0.sh diff --git a/examples/finetune_t0.sh b/examples/finetune_t0.sh new file mode 100644 index 000000000..1bf1e81a6 --- /dev/null +++ b/examples/finetune_t0.sh @@ -0,0 +1,44 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH="{ \ + 'input_tokens': 'tests/data/t0/ag_news_prompt_inputs_document', \ + 'target_tokens': 'tests/data/t0/ag_news_prompt_targets_document' \ + }" +CHECKPOINT_PATH="./checkpoints" +TOKENIZER_PATH=gpt2 + +deepspeed --num_gpus 1 pretrain_t0.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 4 \ + --global-batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 500000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 diff --git a/pretrain_t0.py b/pretrain_t0.py index baac0a9f1..7e6813040 100644 --- a/pretrain_t0.py +++ b/pretrain_t0.py @@ -1,10 +1,55 @@ -"""Pretrain T0""" +"""Multitask Finetuning T0""" import torch +from functools import partial -from megatron import get_args, get_tokenizer, mpu +from megatron import get_args, get_tokenizer, get_timers, print_rank_0, mpu +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import os + +try: + from torch.distributed.elastic.multiprocessing.errors import record +except ImportError: + # noop + def record(fn): + return fn + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe_packed + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + see_memory_usage(f"After Building Model", force=True) + return model def get_batch_pipe_packed(data): @@ -69,3 +114,96 @@ def get_batch_pipe_packed(data): loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. 
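+    # `loss` above is the masked mean actually used for the backward pass;
+    # the all-reduce below only averages the detached scalar across
+    # data-parallel ranks, so the logged 'lm loss' reflects the global batch
+    # rather than a single rank's shard.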
+ averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + + import json + data_path_dict = [json.loads(args.data_path)] + + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + # elif args.train_weighted_split_paths: + # assigned_train_valid_test = [] + # if args.train_weighted_split_paths is not None: + # train_ds = [] + # assigned_train_valid_test.append("train") + # if args.valid_weighted_split_paths is not None: + # valid_ds = [] + # assigned_train_valid_test.append("valid") + # if args.test_weighted_split_paths is not None: + # test_ds = [] + # assigned_train_valid_test.append("test") + + # for s in assigned_train_valid_test: + # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + # eval(f"args.{s}_weighted_split_weights"), + # eval(f"args.{s}_weighted_split_splits"), + # eval(f"args.{s}_weighted_split_names")) + # for paths, weights, splits, name in data_groups: + # d = build_dataset_group(name, paths, weights, splits, + # args.data_impl, + # train_val_test_num_samples, + # args.seq_length, args.seed, + # (not args.mmap_warmup), + # train_valid_test=s) + # eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating T0 datasets ...") + return train_ds, valid_ds, test_ds + +@record +def main(): + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + +if __name__ == "__main__": + main() \ No newline at end of file From a7c424e6ada8f06ec6d16b23605ffa0eb16e8160 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Wed, 29 Jun 2022 23:14:43 +0700 Subject: [PATCH 167/297] Update pretrain_t0.py --- pretrain_t0.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pretrain_t0.py b/pretrain_t0.py index 7e6813040..7ccb45830 100644 --- a/pretrain_t0.py +++ b/pretrain_t0.py @@ -160,7 +160,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_path_dict = [json.loads(args.data_path)] train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, + data_prefix=data_path_dict, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, @@ -206,4 +206,4 @@ 
def main(): args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) if __name__ == "__main__": - main() \ No newline at end of file + main() From 51d6c4025f84239fa7d344e3799b7633d827690c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Wed, 29 Jun 2022 18:52:57 +0200 Subject: [PATCH 168/297] Remove a bunch of code that's not needed --- examples/finetune_t0.sh | 44 ---- megatron/utils.py | 56 ----- pretrain_t0.py | 209 ------------------ .../t0/ag_news_classify_question_first.json | 100 --------- .../t0/ag_news_prompt_inputs_document.bin | Bin 12038 -> 0 bytes .../t0/ag_news_prompt_inputs_document.idx | Bin 2042 -> 0 bytes .../data/t0/ag_news_prompt_text_document.bin | Bin 12526 -> 0 bytes .../data/t0/ag_news_prompt_text_document.idx | Bin 2042 -> 0 bytes 8 files changed, 409 deletions(-) delete mode 100644 examples/finetune_t0.sh delete mode 100644 pretrain_t0.py delete mode 100644 tests/data/t0/ag_news_classify_question_first.json delete mode 100644 tests/data/t0/ag_news_prompt_inputs_document.bin delete mode 100644 tests/data/t0/ag_news_prompt_inputs_document.idx delete mode 100644 tests/data/t0/ag_news_prompt_text_document.bin delete mode 100644 tests/data/t0/ag_news_prompt_text_document.idx diff --git a/examples/finetune_t0.sh b/examples/finetune_t0.sh deleted file mode 100644 index 1bf1e81a6..000000000 --- a/examples/finetune_t0.sh +++ /dev/null @@ -1,44 +0,0 @@ -#! /bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH="{ \ - 'input_tokens': 'tests/data/t0/ag_news_prompt_inputs_document', \ - 'target_tokens': 'tests/data/t0/ag_news_prompt_targets_document' \ - }" -CHECKPOINT_PATH="./checkpoints" -TOKENIZER_PATH=gpt2 - -deepspeed --num_gpus 1 pretrain_t0.py \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 4 \ - --global-batch-size 8 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 diff --git a/megatron/utils.py b/megatron/utils.py index 799196c10..98d2f611c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -250,62 +250,6 @@ def get_ltor_masks_and_position_ids( return attention_mask, loss_mask, position_ids -def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, segment_ids, datatype=torch.int64): - """ - Inspired by https://github.com/google-research/t5x/blob/7193407f98a8b18100b71a04ff777238be1682ca/t5x/examples/decoder_only/layers.py#L978 - """ - inputs_mask = decoder_causal_attention.unsqueeze(-1) * decoder_causal_attention.unsqueeze(1) - inputs_mask = inputs_mask.unsqueeze(1) - - """Causal Inputs Mask: - mask = [[[[1, 1, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0], - [1, 1, 1, 1, 1, 0, 0], - [1, 1, 1, 1, 1, 0, 0], - [1, 1, 1, 1, 1, 1, 0], - [1, 1, 1, 1, 1, 1, 1]]]] - """ - causal_inputs_mask = torch.logical_or(causal_mask, inputs_mask).to(datatype) - - """Padding Mask: - mask = [[[[1, 1, 1, 1, 1, 1, 0], - [1, 1, 1, 1, 1, 1, 0], - [1, 1, 1, 1, 1, 1, 0], - 
[1, 1, 1, 1, 1, 1, 0], - [1, 1, 1, 1, 1, 1, 0], - [1, 1, 1, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 0, 0]]]] - """ - padding_mask = (tokens > 0).unsqueeze(-1) * (tokens > 0).unsqueeze(1) - padding_mask = padding_mask.unsqueeze(1) - - - """Segment Mask: - mask = [[[[1, 1, 1, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 0], - [0, 0, 0, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 0, 0]]]] - """ - segment_mask = (segment_ids.unsqueeze(-1)) == (segment_ids.unsqueeze(1)) - segment_mask = segment_mask.unsqueeze(1) - - """Final Mask: - mask = [[[[1, 1, 0, 0, 0, 0, 0], - [1, 1, 0, 0, 0, 0, 0], - [1, 1, 1, 0, 0, 0, 0], - [0, 0, 0, 1, 1, 0, 0], - [0, 0, 0, 1, 1, 0, 0], - [0, 0, 0, 1, 1, 1, 0], - [0, 0, 0, 0, 0, 0, 0]]]] - """ - attention_mask = causal_inputs_mask * padding_mask * segment_mask - - return attention_mask - def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement() diff --git a/pretrain_t0.py b/pretrain_t0.py deleted file mode 100644 index 7ccb45830..000000000 --- a/pretrain_t0.py +++ /dev/null @@ -1,209 +0,0 @@ -"""Multitask Finetuning T0""" - -import torch -from functools import partial - -from megatron import get_args, get_tokenizer, get_timers, print_rank_0, mpu -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import os - -try: - from torch.distributed.elastic.multiprocessing.errors import record -except ImportError: - # noop - def record(fn): - return fn - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe_packed - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process - ) - see_memory_usage(f"After Building Model", force=True) - return model - - -def get_batch_pipe_packed(data): - """ - Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion - - data: - decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] - decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]] - decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]] - """ - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = ['decoder_target_tokens', 'decoder_segment_ids', 'decoder_causal_attention'] - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. 
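# A self-contained sketch of the mask composition shown above:
# (causal OR bidirectional-inputs) AND padding AND same-segment.
# packed_attention_mask is a hypothetical name, and pad token id 0 is an
# assumption carried over from the docstring examples.
import torch

def packed_attention_mask(tokens, segment_ids, decoder_causal_attention):
    b, s = tokens.shape
    # Lower-triangular causal mask, broadcast over heads:
    causal = torch.tril(torch.ones(b, s, s, dtype=torch.bool)).unsqueeze(1)
    # Input (prompt) positions may attend to each other bidirectionally:
    inputs = (decoder_causal_attention.unsqueeze(-1)
              * decoder_causal_attention.unsqueeze(1)).unsqueeze(1).bool()
    # Padding neither attends nor is attended to:
    pad = ((tokens > 0).unsqueeze(-1) * (tokens > 0).unsqueeze(1)).unsqueeze(1)
    # Attention never crosses packed-example boundaries:
    seg = (segment_ids.unsqueeze(-1) == segment_ids.unsqueeze(1)).unsqueeze(1)
    return (causal | inputs) & pad & seg

mask = packed_attention_mask(
    tokens=torch.tensor([[6, 7, 8, 3, 4, 5, 0]]),
    segment_ids=torch.tensor([[1, 1, 1, 2, 2, 2, 0]]),
    decoder_causal_attention=torch.tensor([[1, 1, 0, 1, 1, 0, 0]]),
)
assert mask.shape == (1, 1, 7, 7)
# mask[0, 0].int() reproduces the "Final Mask" matrix in the docstring above.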
- tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - tokens_ = data_b['decoder_target_tokens'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - segment_ids = data_b['decoder_segment_ids'].long()[:, :-1] - decoder_causal_attention = data_b['decoder_causal_attention'].long()[:, :-1] - - # Get the masks and position ids. - causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=None, - loss_on_targets_only=False # This is done below - ) - # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding - loss_mask *= torch.logical_and((decoder_causal_attention - 1) * -1, tokens) - loss_mask = loss_mask.to(datatype) - - attention_mask = get_packed_attention_mask( - causal_mask=causal_mask, - tokens=tokens, - decoder_causal_attention=decoder_causal_attention, - segment_ids=segment_ids, - datatype=datatype, - ) - - if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]: - # seqlen-based curriculum learning - # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] - tokens = tokens[:, :args.curriculum_seqlen].contiguous() - position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() - labels = labels[:, :args.curriculum_seqlen].contiguous() - loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() - - return (tokens, position_ids, attention_mask), (labels, loss_mask) - - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. 
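# A toy trace of the targets-only loss masking a few lines above:
# (decoder_causal_attention - 1) * -1 inverts the 0/1 prompt marker, and the
# logical_and with `tokens` also zeroes padding (assuming pad token id 0).
# Values match the docstring example:
import torch

decoder_causal_attention = torch.tensor([[1, 1, 0, 1, 1, 0, 0]])  # 1 = prompt position
tokens                   = torch.tensor([[6, 7, 8, 3, 4, 5, 0]])  # 0 = padding

targets_only = torch.logical_and((decoder_causal_attention - 1) * -1, tokens)
# -> [[0, 0, 1, 0, 0, 1, 0]]: the loss is taken only at target positions,
#    never at prompt or padding positions.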
- timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - if args.curriculum_learning and args.curriculum_seqlen < args.seq_length: - loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - - import json - data_path_dict = [json.loads(args.data_path)] - - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=data_path_dict, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) - - # # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - # elif args.train_weighted_split_paths: - # assigned_train_valid_test = [] - # if args.train_weighted_split_paths is not None: - # train_ds = [] - # assigned_train_valid_test.append("train") - # if args.valid_weighted_split_paths is not None: - # valid_ds = [] - # assigned_train_valid_test.append("valid") - # if args.test_weighted_split_paths is not None: - # test_ds = [] - # assigned_train_valid_test.append("test") - - # for s in assigned_train_valid_test: - # data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - # eval(f"args.{s}_weighted_split_weights"), - # eval(f"args.{s}_weighted_split_splits"), - # eval(f"args.{s}_weighted_split_names")) - # for paths, weights, splits, name in data_groups: - # d = build_dataset_group(name, paths, weights, splits, - # args.data_impl, - # train_val_test_num_samples, - # args.seq_length, args.seed, - # (not args.mmap_warmup), - # train_valid_test=s) - # eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating T0 datasets ...") - return train_ds, valid_ds, test_ds - -@record -def main(): - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) - -if __name__ == "__main__": - main() diff --git a/tests/data/t0/ag_news_classify_question_first.json b/tests/data/t0/ag_news_classify_question_first.json deleted file mode 100644 index 3f82f55d2..000000000 --- a/tests/data/t0/ag_news_classify_question_first.json +++ /dev/null @@ -1,100 +0,0 @@ -{"text": "What label best describes this news article?\nWall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWall St. 
Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", "targets": "Business"} -{"text": "What label best describes this news article?\nCarlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.<|endoftext|>Business", "inputs": "What label best describes this news article?\nCarlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.", "targets": "Business"} -{"text": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.", "targets": "Business"} -{"text": "What label best describes this news article?\nIraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.", "targets": "Business"} -{"text": "What label best describes this news article?\nOil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections.", "targets": "Business"} -{"text": "What label best describes this news article?\nStocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\\but stayed near lows for the year as oil prices surged past #36;46\\a barrel, offsetting a positive outlook from computer maker\\Dell Inc. (DELL.O)<|endoftext|>Business", "inputs": "What label best describes this news article?\nStocks End Up, But Near Year Lows (Reuters) Reuters - Stocks ended slightly higher on Friday\\but stayed near lows for the year as oil prices surged past #36;46\\a barrel, offsetting a positive outlook from computer maker\\Dell Inc. 
(DELL.O)", "targets": "Business"} -{"text": "What label best describes this news article?\nMoney Funds Fell in Latest Week (AP) AP - Assets of the nation's retail money market mutual funds fell by #36;1.17 billion in the latest week to #36;849.98 trillion, the Investment Company Institute said Thursday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMoney Funds Fell in Latest Week (AP) AP - Assets of the nation's retail money market mutual funds fell by #36;1.17 billion in the latest week to #36;849.98 trillion, the Investment Company Institute said Thursday.", "targets": "Business"} -{"text": "What label best describes this news article?\nFed minutes show dissent over inflation (USATODAY.com) USATODAY.com - Retail sales bounced back a bit in July, and new claims for jobless benefits fell last week, the government said Thursday, indicating the economy is improving from a midsummer slump.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFed minutes show dissent over inflation (USATODAY.com) USATODAY.com - Retail sales bounced back a bit in July, and new claims for jobless benefits fell last week, the government said Thursday, indicating the economy is improving from a midsummer slump.", "targets": "Business"} -{"text": "What label best describes this news article?\nSafety Net (Forbes.com) Forbes.com - After earning a PH.D. in Sociology, Danny Bazil Riley started to work as the general manager at a commercial real estate firm at an annual base salary of #36;70,000. Soon after, a financial planner stopped by his desk to drop off brochures about insurance benefits available through his employer. But, at 32, \"buying insurance was the furthest thing from my mind,\" says Riley.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSafety Net (Forbes.com) Forbes.com - After earning a PH.D. in Sociology, Danny Bazil Riley started to work as the general manager at a commercial real estate firm at an annual base salary of #36;70,000. Soon after, a financial planner stopped by his desk to drop off brochures about insurance benefits available through his employer. But, at 32, \"buying insurance was the furthest thing from my mind,\" says Riley.", "targets": "Business"} -{"text": "What label best describes this news article?\nWall St. Bears Claw Back Into the Black NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWall St. 
Bears Claw Back Into the Black NEW YORK (Reuters) - Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again.", "targets": "Business"} -{"text": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook NEW YORK (Reuters) - Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOil and Economy Cloud Stocks' Outlook NEW YORK (Reuters) - Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums.", "targets": "Business"} -{"text": "What label best describes this news article?\nNo Need for OPEC to Pump More-Iran Gov TEHRAN (Reuters) - OPEC can do nothing to douse scorching oil prices when markets are already oversupplied by 2.8 million barrels per day (bpd) of crude, Iran's OPEC governor said Saturday, warning that prices could fall sharply.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNo Need for OPEC to Pump More-Iran Gov TEHRAN (Reuters) - OPEC can do nothing to douse scorching oil prices when markets are already oversupplied by 2.8 million barrels per day (bpd) of crude, Iran's OPEC governor said Saturday, warning that prices could fall sharply.", "targets": "Business"} -{"text": "What label best describes this news article?\nNon-OPEC Nations Should Up Output-Purnomo JAKARTA (Reuters) - Non-OPEC oil exporters should consider increasing output to cool record crude prices, OPEC President Purnomo Yusgiantoro said on Sunday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNon-OPEC Nations Should Up Output-Purnomo JAKARTA (Reuters) - Non-OPEC oil exporters should consider increasing output to cool record crude prices, OPEC President Purnomo Yusgiantoro said on Sunday.", "targets": "Business"} -{"text": "What label best describes this news article?\nGoogle IPO Auction Off to Rocky Start WASHINGTON/NEW YORK (Reuters) - The auction for Google Inc.'s highly anticipated initial public offering got off to a rocky start on Friday after the Web search company sidestepped a bullet from U.S. securities regulators.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO Auction Off to Rocky Start WASHINGTON/NEW YORK (Reuters) - The auction for Google Inc.'s highly anticipated initial public offering got off to a rocky start on Friday after the Web search company sidestepped a bullet from U.S. securities regulators.", "targets": "Business"} -{"text": "What label best describes this news article?\nDollar Falls Broadly on Record Trade Gap NEW YORK (Reuters) - The dollar tumbled broadly on Friday after data showing a record U.S. trade deficit in June cast fresh doubts on the economy's recovery and its ability to draw foreign capital to fund the growing gap.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDollar Falls Broadly on Record Trade Gap NEW YORK (Reuters) - The dollar tumbled broadly on Friday after data showing a record U.S. 
trade deficit in June cast fresh doubts on the economy's recovery and its ability to draw foreign capital to fund the growing gap.", "targets": "Business"} -{"text": "What label best describes this news article?\nRescuing an Old Saver If you think you may need to help your elderly relatives with their finances, don't be shy about having the money talk -- soon.<|endoftext|>Business", "inputs": "What label best describes this news article?\nRescuing an Old Saver If you think you may need to help your elderly relatives with their finances, don't be shy about having the money talk -- soon.", "targets": "Business"} -{"text": "What label best describes this news article?\nKids Rule for Back-to-School The purchasing power of kids is a big part of why the back-to-school season has become such a huge marketing phenomenon.<|endoftext|>Business", "inputs": "What label best describes this news article?\nKids Rule for Back-to-School The purchasing power of kids is a big part of why the back-to-school season has become such a huge marketing phenomenon.", "targets": "Business"} -{"text": "What label best describes this news article?\nIn a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIn a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.", "targets": "Business"} -{"text": "What label best describes this news article?\nUS trade deficit swells in June The US trade deficit has exploded 19 to a record \\$55.8bn as oil costs drove imports higher, according to a latest figures.<|endoftext|>Business", "inputs": "What label best describes this news article?\nUS trade deficit swells in June The US trade deficit has exploded 19 to a record \\$55.8bn as oil costs drove imports higher, according to a latest figures.", "targets": "Business"} -{"text": "What label best describes this news article?\nShell 'could be target for Total' Oil giant Shell could be bracing itself for a takeover attempt, possibly from French rival Total, a press report claims.<|endoftext|>Business", "inputs": "What label best describes this news article?\nShell 'could be target for Total' Oil giant Shell could be bracing itself for a takeover attempt, possibly from French rival Total, a press report claims.", "targets": "Business"} -{"text": "What label best describes this news article?\nGoogle IPO faces Playboy slip-up The bidding gets underway for Google's public offering, despite last-minute worries over an interview with its bosses in Playboy magazine.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO faces Playboy slip-up The bidding gets underway for Google's public offering, despite last-minute worries over an interview with its bosses in Playboy magazine.", "targets": "Business"} -{"text": "What label best describes this news article?\nEurozone economy keeps growing Official figures show the 12-nation eurozone economy continues to grow, but there are warnings it may slow down later in the year.<|endoftext|>Business", "inputs": "What label best describes this news article?\nEurozone economy keeps growing 
Official figures show the 12-nation eurozone economy continues to grow, but there are warnings it may slow down later in the year.", "targets": "Business"} -{"text": "What label best describes this news article?\nExpansion slows in Japan Economic growth in Japan slows down as the country experiences a drop in domestic and corporate spending.<|endoftext|>Business", "inputs": "What label best describes this news article?\nExpansion slows in Japan Economic growth in Japan slows down as the country experiences a drop in domestic and corporate spending.", "targets": "Business"} -{"text": "What label best describes this news article?\nRand falls on shock SA rate cut Interest rates are trimmed to 7.5 by the South African central bank, but the lack of warning hits the rand and surprises markets.<|endoftext|>Business", "inputs": "What label best describes this news article?\nRand falls on shock SA rate cut Interest rates are trimmed to 7.5 by the South African central bank, but the lack of warning hits the rand and surprises markets.", "targets": "Business"} -{"text": "What label best describes this news article?\nCar prices down across the board The cost of buying both new and second hand cars fell sharply over the past five years, a new survey has found.<|endoftext|>Business", "inputs": "What label best describes this news article?\nCar prices down across the board The cost of buying both new and second hand cars fell sharply over the past five years, a new survey has found.", "targets": "Business"} -{"text": "What label best describes this news article?\nSouth Korea lowers interest rates South Korea's central bank cuts interest rates by a quarter percentage point to 3.5 in a bid to drive growth in the economy.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSouth Korea lowers interest rates South Korea's central bank cuts interest rates by a quarter percentage point to 3.5 in a bid to drive growth in the economy.", "targets": "Business"} -{"text": "What label best describes this news article?\nGoogle auction begins on Friday An auction of shares in Google, the web search engine which could be floated for as much as \\$36bn, takes place on Friday.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle auction begins on Friday An auction of shares in Google, the web search engine which could be floated for as much as \\$36bn, takes place on Friday.", "targets": "Business"} -{"text": "What label best describes this news article?\nHP shares tumble on profit news Hewlett-Packard shares fall after disappointing third-quarter profits, while the firm warns the final quarter will also fall short of expectations.<|endoftext|>Business", "inputs": "What label best describes this news article?\nHP shares tumble on profit news Hewlett-Packard shares fall after disappointing third-quarter profits, while the firm warns the final quarter will also fall short of expectations.", "targets": "Business"} -{"text": "What label best describes this news article?\nMauritian textile firm cuts jobs One of the oldest textile operators on the Indian Ocean island of Mauritius last week shut seven factories and cut 900 jobs.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMauritian textile firm cuts jobs One of the oldest textile operators on the Indian Ocean island of Mauritius last week shut seven factories and cut 900 jobs.", "targets": "Business"} -{"text": "What label best describes this news article?\nChad seeks refugee aid 
from IMF Chad asks the IMF for a loan to pay for looking after more than 100,000 refugees from conflict-torn Darfur in western Sudan.<|endoftext|>Business", "inputs": "What label best describes this news article?\nChad seeks refugee aid from IMF Chad asks the IMF for a loan to pay for looking after more than 100,000 refugees from conflict-torn Darfur in western Sudan.", "targets": "Business"} -{"text": "What label best describes this news article?\nJapan nuclear firm shuts plants The company running the Japanese nuclear plant hit by a fatal accident is to close its reactors for safety checks.<|endoftext|>Business", "inputs": "What label best describes this news article?\nJapan nuclear firm shuts plants The company running the Japanese nuclear plant hit by a fatal accident is to close its reactors for safety checks.", "targets": "Business"} -{"text": "What label best describes this news article?\nVeteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.<|endoftext|>Business", "inputs": "What label best describes this news article?\nVeteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.", "targets": "Business"} -{"text": "What label best describes this news article?\nSaudi Arabia to open up oil taps Saudi Arabia says it is ready to push an extra 1.3 million barrels a day of oil into the market, to help reverse surging prices.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSaudi Arabia to open up oil taps Saudi Arabia says it is ready to push an extra 1.3 million barrels a day of oil into the market, to help reverse surging prices.", "targets": "Business"} -{"text": "What label best describes this news article?\nSaudi phone sector gets \\$1bn lift A group led by the UAE's Etisalat plans to spend \\$1bn (544m) on expansion after winning two mobile phone licences in Saudi Arabia.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSaudi phone sector gets \\$1bn lift A group led by the UAE's Etisalat plans to spend \\$1bn (544m) on expansion after winning two mobile phone licences in Saudi Arabia.", "targets": "Business"} -{"text": "What label best describes this news article?\nIndians fill rail skills shortage Network Rail flies in specialist Indian engineers to work on the West Coast Mainline because of a UK skills shortage.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIndians fill rail skills shortage Network Rail flies in specialist Indian engineers to work on the West Coast Mainline because of a UK skills shortage.", "targets": "Business"} -{"text": "What label best describes this news article?\nSteady as they go BEDFORD -- Scientists at NitroMed Inc. hope their experimental drugs will cure heart disease someday. But lately their focus has been on more mundane matters.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSteady as they go BEDFORD -- Scientists at NitroMed Inc. hope their experimental drugs will cure heart disease someday. But lately their focus has been on more mundane matters.", "targets": "Business"} -{"text": "What label best describes this news article?\nGoogle IPO: Type in 'confusing,' 'secrecy' I've submitted my bid to buy shares of Google Inc. 
in the computer search company's giant auction-style initial public offering. That could turn out to be the good news or the bad news.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGoogle IPO: Type in 'confusing,' 'secrecy' I've submitted my bid to buy shares of Google Inc. in the computer search company's giant auction-style initial public offering. That could turn out to be the good news or the bad news.", "targets": "Business"} -{"text": "What label best describes this news article?\nA bargain hunter's paradise Massachusetts bargain hunters showed up in droves and shopped hard on yesterday's sales tax holiday, buying everything from treadmills and snow blowers to candles and chandeliers, and crediting the 5-percent tax break with bringing them into the stores.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA bargain hunter's paradise Massachusetts bargain hunters showed up in droves and shopped hard on yesterday's sales tax holiday, buying everything from treadmills and snow blowers to candles and chandeliers, and crediting the 5-percent tax break with bringing them into the stores.", "targets": "Business"} -{"text": "What label best describes this news article?\nResearchers seek to untangle the e-mail thread E-mail is a victim of its own success. That's the conclusion of IBM Corp. researchers in Cambridge, who have spent nearly a decade conducting field tests at IBM and other companies about how employees work and use electronic mail. It's clear to them that e-mail has become the Internet's killer application.<|endoftext|>Business", "inputs": "What label best describes this news article?\nResearchers seek to untangle the e-mail thread E-mail is a victim of its own success. That's the conclusion of IBM Corp. researchers in Cambridge, who have spent nearly a decade conducting field tests at IBM and other companies about how employees work and use electronic mail. It's clear to them that e-mail has become the Internet's killer application.", "targets": "Business"} -{"text": "What label best describes this news article?\nMicrosoft Corp. 2.0: a kinder corporate culture Even a genius can mess up. Bill Gates was a brilliant technologist when he cofounded Microsoft , but as he guided it to greatness in both size and historical consequence, he blundered. He terrorized underlings with his temper and parceled out praise like Scrooge gave to charity. Only the lash inspired the necessary aggressiveness to beat the competition, he thought.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMicrosoft Corp. 2.0: a kinder corporate culture Even a genius can mess up. Bill Gates was a brilliant technologist when he cofounded Microsoft , but as he guided it to greatness in both size and historical consequence, he blundered. He terrorized underlings with his temper and parceled out praise like Scrooge gave to charity. Only the lash inspired the necessary aggressiveness to beat the competition, he thought.", "targets": "Business"} -{"text": "What label best describes this news article?\nLetters Target the abusers of legal weapons We can all share the outrage, expressed by columnist Steve Bailey (''Summer Sizzler, quot; Aug. 11), at the killings in the city's poor neighborhoods. But there's no need to share his ignorance. He argues for renewal of the so-called assault weapon ban, claiming that otherwise, ''UZIs and AK-47s could again be flooding the streets. 
quot; His ...<|endoftext|>Business", "inputs": "What label best describes this news article?\nLetters Target the abusers of legal weapons We can all share the outrage, expressed by columnist Steve Bailey (''Summer Sizzler, quot; Aug. 11), at the killings in the city's poor neighborhoods. But there's no need to share his ignorance. He argues for renewal of the so-called assault weapon ban, claiming that otherwise, ''UZIs and AK-47s could again be flooding the streets. quot; His ...", "targets": "Business"} -{"text": "What label best describes this news article?\nSomewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner. quot; Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSomewhere between gleam and gloom President Bush has been saying that the US economy has ''turned the corner. quot; Democratic presidential candidate Senator John F. Kerry, in the wake of this month's poor jobs report, quipped that it was more like a U-turn.", "targets": "Business"} -{"text": "What label best describes this news article?\nTechnology company sues five ex-employees A Marlborough-based technology company is suing five former employees, including three senior managers, for allegedly conspiring against their employer while working on opening a competing business.<|endoftext|>Business", "inputs": "What label best describes this news article?\nTechnology company sues five ex-employees A Marlborough-based technology company is suing five former employees, including three senior managers, for allegedly conspiring against their employer while working on opening a competing business.", "targets": "Business"} -{"text": "What label best describes this news article?\nGrant to aid Lynn Central Square Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.<|endoftext|>Business", "inputs": "What label best describes this news article?\nGrant to aid Lynn Central Square Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.", "targets": "Business"} -{"text": "What label best describes this news article?\nState grant to aid Lynn; Bank gives Salem \\$10k Central Square in Lynn should be looking a bit brighter. New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.<|endoftext|>Business", "inputs": "What label best describes this news article?\nState grant to aid Lynn; Bank gives Salem \\$10k Central Square in Lynn should be looking a bit brighter. 
New sidewalks, curbs, fences, lights, landscaping, and road improvements are planned for the Gateway Artisan Block, a key area of the square, with \\$830,000 in state grant money given to Lynn last week.", "targets": "Business"} -{"text": "What label best describes this news article?\nA New Legal Chapter for a 90's Flameout A lawsuit against Gary Winnick, the former chief of Global Crossing, refocuses attention on what Mr. Winnick knew about his company's finances as it imploded.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA New Legal Chapter for a 90's Flameout A lawsuit against Gary Winnick, the former chief of Global Crossing, refocuses attention on what Mr. Winnick knew about his company's finances as it imploded.", "targets": "Business"} -{"text": "What label best describes this news article?\nWill Russia, the Oil Superpower, Flex Its Muscles? Russia is again emerging as a superpower - but the reason has less to do with nuclear weapons than with oil.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWill Russia, the Oil Superpower, Flex Its Muscles? Russia is again emerging as a superpower - but the reason has less to do with nuclear weapons than with oil.", "targets": "Business"} -{"text": "What label best describes this news article?\nSwitching Titles, if Not Gears, at Dell Kevin B. Rollins, the new chief executive of Dell, talks about Dell's transitory slip in customer service, and why he sees a broader technology recovery taking place.<|endoftext|>Business", "inputs": "What label best describes this news article?\nSwitching Titles, if Not Gears, at Dell Kevin B. Rollins, the new chief executive of Dell, talks about Dell's transitory slip in customer service, and why he sees a broader technology recovery taking place.", "targets": "Business"} -{"text": "What label best describes this news article?\nFor Sale: The Ultimate Status Symbol With the country in need of cash and rich people dying to show off their wealth, Mr. Stein proposes a unique solution: having the government sell titles of nobility.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFor Sale: The Ultimate Status Symbol With the country in need of cash and rich people dying to show off their wealth, Mr. Stein proposes a unique solution: having the government sell titles of nobility.", "targets": "Business"} -{"text": "What label best describes this news article?\nQuality Gets Swept Away Quality Distribution is hammered after reporting a large loss for the second quarter.<|endoftext|>Business", "inputs": "What label best describes this news article?\nQuality Gets Swept Away Quality Distribution is hammered after reporting a large loss for the second quarter.", "targets": "Business"} -{"text": "What label best describes this news article?\nMaking Your Insurer Pay If Hurricane Charley blows your house down, how can you make your insurance company pay?<|endoftext|>Business", "inputs": "What label best describes this news article?\nMaking Your Insurer Pay If Hurricane Charley blows your house down, how can you make your insurance company pay?", "targets": "Business"} -{"text": "What label best describes this news article?\nDelightful Dell The company's results show that it's not grim all over tech world. Just all of it that isn't Dell.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDelightful Dell The company's results show that it's not grim all over tech world. 
Just all of it that isn't Dell.", "targets": "Business"} -{"text": "What label best describes this news article?\nChrysler's Bling King After a tough year, Detroit's troubled carmaker is back -- thanks to a maverick designer and a car that is dazzling the hip-hop crowd<|endoftext|>Business", "inputs": "What label best describes this news article?\nChrysler's Bling King After a tough year, Detroit's troubled carmaker is back -- thanks to a maverick designer and a car that is dazzling the hip-hop crowd", "targets": "Business"} -{"text": "What label best describes this news article?\nWhat's Cool In the Pool ... ... And Hot On the Deck Americans are spending more on tricking out the places where they swim. Here's a look at the new wave of accessories<|endoftext|>Business", "inputs": "What label best describes this news article?\nWhat's Cool In the Pool ... ... And Hot On the Deck Americans are spending more on tricking out the places where they swim. Here's a look at the new wave of accessories", "targets": "Business"} -{"text": "What label best describes this news article?\nThe Age of Doom In 1993 six geeks had a digital nightmare that changed the culture. It's about to get far creepier<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe Age of Doom In 1993 six geeks had a digital nightmare that changed the culture. It's about to get far creepier", "targets": "Business"} -{"text": "What label best describes this news article?\nHip Hop's Online Shop Celebrity fashion is booming. These webpreneurs are bringing it to main street<|endoftext|>Business", "inputs": "What label best describes this news article?\nHip Hop's Online Shop Celebrity fashion is booming. These webpreneurs are bringing it to main street", "targets": "Business"} -{"text": "What label best describes this news article?\nStoking the Steamroller No other recording artist can channel American middle-class tastes quite like Chip Davis and his best-selling band<|endoftext|>Business", "inputs": "What label best describes this news article?\nStoking the Steamroller No other recording artist can channel American middle-class tastes quite like Chip Davis and his best-selling band", "targets": "Business"} -{"text": "What label best describes this news article?\nComing to The Rescue Got a unique problem? Not to worry: you can find a financial planner for every specialized need<|endoftext|>Business", "inputs": "What label best describes this news article?\nComing to The Rescue Got a unique problem? Not to worry: you can find a financial planner for every specialized need", "targets": "Business"} -{"text": "What label best describes this news article?\nThe New Customers Are In Town Today's customers are increasingly demanding, in Asia as elsewhere in the world. Henry Astorga describes the complex reality faced by today's marketers, which includes much higher expectations than we have been used to. Today's customers want performance, and they want it now!<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe New Customers Are In Town Today's customers are increasingly demanding, in Asia as elsewhere in the world. Henry Astorga describes the complex reality faced by today's marketers, which includes much higher expectations than we have been used to. 
Today's customers want performance, and they want it now!", "targets": "Business"} -{"text": "What label best describes this news article?\nBarrel of Monkeys, 2004 Edition: Notes on Philippine Elections Well, it's election time in the Republic of the Philippines, and that means the monkeys are rolling around in those political barrels, having as much fun as they can while laughing their heads off at the strange goings-on that characterize a democratic process loosely based on the American model but that de facto looks more like a Fellini movie crossed with a Tom and Jerry cartoon - column includes a useful election-year glossary!<|endoftext|>Business", "inputs": "What label best describes this news article?\nBarrel of Monkeys, 2004 Edition: Notes on Philippine Elections Well, it's election time in the Republic of the Philippines, and that means the monkeys are rolling around in those political barrels, having as much fun as they can while laughing their heads off at the strange goings-on that characterize a democratic process loosely based on the American model but that de facto looks more like a Fellini movie crossed with a Tom and Jerry cartoon - column includes a useful election-year glossary!", "targets": "Business"} -{"text": "What label best describes this news article?\nOldsmobile: The final parking lot Why General Motors dropped the Oldsmobile. The four brand paradoxes GM had to face - the name, the product, image re-positioning, and the consumer - all added up to a brand that had little hope of rebranding.<|endoftext|>Business", "inputs": "What label best describes this news article?\nOldsmobile: The final parking lot Why General Motors dropped the Oldsmobile. The four brand paradoxes GM had to face - the name, the product, image re-positioning, and the consumer - all added up to a brand that had little hope of rebranding.", "targets": "Business"} -{"text": "What label best describes this news article?\nNot All Jobs Belong To The White Man: Asian Minorities, Affirmative Action, And The Quest For Parity At Work Although a smattering of Chinese, Filipinos, Japanese, Indians, Thais, and others may crow about seeing their kind sitting in prominent positions in corporations and organizations in the USA, these accomplishments become mere cultural high-fives and ritualistic chest-thumping goaded and impishly patronized by 'mainstream society' - the milder and gentler term for the white-dominated populace.<|endoftext|>Business", "inputs": "What label best describes this news article?\nNot All Jobs Belong To The White Man: Asian Minorities, Affirmative Action, And The Quest For Parity At Work Although a smattering of Chinese, Filipinos, Japanese, Indians, Thais, and others may crow about seeing their kind sitting in prominent positions in corporations and organizations in the USA, these accomplishments become mere cultural high-fives and ritualistic chest-thumping goaded and impishly patronized by 'mainstream society' - the milder and gentler term for the white-dominated populace.", "targets": "Business"} -{"text": "What label best describes this news article?\nDownhome Pinoy Blues, Intersecting Life Paths, and Heartbreak Songs The Blues is alive and well in the Philippines, as evidenced by this appreciation of the Pinoy Blues band 'Lampano Alley', penned by columnist Clarence Henderson as a counterpoint to his usual economics, business, and culture fare.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDownhome Pinoy Blues, Intersecting Life Paths, and 
Heartbreak Songs The Blues is alive and well in the Philippines, as evidenced by this appreciation of the Pinoy Blues band 'Lampano Alley', penned by columnist Clarence Henderson as a counterpoint to his usual economics, business, and culture fare.", "targets": "Business"} -{"text": "What label best describes this news article?\nThe Real Time Modern Manila Blues: Bill Monroe Meets Muddy Waters in the Orient Globalization does strange things to people. A day in the life of a Manila Philippines based business consultant - proving that you really CAN talk about Muddy Walters, bluegrass and work all on the same page...<|endoftext|>Business", "inputs": "What label best describes this news article?\nThe Real Time Modern Manila Blues: Bill Monroe Meets Muddy Waters in the Orient Globalization does strange things to people. A day in the life of a Manila Philippines based business consultant - proving that you really CAN talk about Muddy Walters, bluegrass and work all on the same page...", "targets": "Business"} -{"text": "What label best describes this news article?\nBest Asian Tourism Destinations The new APMF survey of the best Asian tourism destinations has just kicked off, but it's crowded at the top, with Chiang Mai in Thailand just leading from perennial favourites Hong Kong, Bangkok and Phuket in Thailand, and Bali in Indonesia. Be one of the first to vote and let us know your reasons.<|endoftext|>Business", "inputs": "What label best describes this news article?\nBest Asian Tourism Destinations The new APMF survey of the best Asian tourism destinations has just kicked off, but it's crowded at the top, with Chiang Mai in Thailand just leading from perennial favourites Hong Kong, Bangkok and Phuket in Thailand, and Bali in Indonesia. Be one of the first to vote and let us know your reasons.", "targets": "Business"} -{"text": "What label best describes this news article?\nWhat are the best cities for business in Asia? One of our new categories in the APMF Sense of Place survey is for best Asian business city. After a couple of days, Singapore leads the pack, followed by Bangkok, Thailand and Hong Kong. Enter your vote and comments and make your views count. More new categories include best city for livability, and best tourism destinations.<|endoftext|>Business", "inputs": "What label best describes this news article?\nWhat are the best cities for business in Asia? One of our new categories in the APMF Sense of Place survey is for best Asian business city. After a couple of days, Singapore leads the pack, followed by Bangkok, Thailand and Hong Kong. Enter your vote and comments and make your views count. More new categories include best city for livability, and best tourism destinations.", "targets": "Business"} -{"text": "What label best describes this news article?\nIT alligator tales I grew up in New York, where giant alligators -- sometimes more ornately described as albino alligators -- were rumored to roam the citys sewer systems. According to legend, vacationers picked up the tiny crocodilians in Florida, brought them home to New York, and eventually flushed the little buggers when they grew too big for the local concrete jungle.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIT alligator tales I grew up in New York, where giant alligators -- sometimes more ornately described as albino alligators -- were rumored to roam the citys sewer systems. 
According to legend, vacationers picked up the tiny crocodilians in Florida, brought them home to New York, and eventually flushed the little buggers when they grew too big for the local concrete jungle.", "targets": "Business"} -{"text": "What label best describes this news article?\nIT Myth 5: Most IT projects fail Do most IT projects fail? Some point to the number of giant consultancies such as IBM Global Services, Capgemini, and Sapient, who feed off bad experiences encountered by enterprises. Sapient is a company founded on the realization that IT projects are not successful, says Sapient CTO Ben Gaucherin.<|endoftext|>Business", "inputs": "What label best describes this news article?\nIT Myth 5: Most IT projects fail Do most IT projects fail? Some point to the number of giant consultancies such as IBM Global Services, Capgemini, and Sapient, who feed off bad experiences encountered by enterprises. Sapient is a company founded on the realization that IT projects are not successful, says Sapient CTO Ben Gaucherin.", "targets": "Business"} -{"text": "What label best describes this news article?\nBEA grabs CA exec to head product group BEA Systems Inc. has hired the Computer Associates International Inc. executive responsible for CA's Unicenter line of enterprise management software to head BEA's product development group.<|endoftext|>Business", "inputs": "What label best describes this news article?\nBEA grabs CA exec to head product group BEA Systems Inc. has hired the Computer Associates International Inc. executive responsible for CA's Unicenter line of enterprise management software to head BEA's product development group.", "targets": "Business"} -{"text": "What label best describes this news article?\nAutodesk tackles project collaboration Autodesk this week unwrapped an updated version of its hosted project collaboration service targeted at the construction and manufacturing industries. Autodesk Buzzsaw lets multiple, dispersed project participants -- including building owners, developers, architects, construction teams, and facility managers -- share and manage data throughout the life of a project, according to Autodesk officials.<|endoftext|>Business", "inputs": "What label best describes this news article?\nAutodesk tackles project collaboration Autodesk this week unwrapped an updated version of its hosted project collaboration service targeted at the construction and manufacturing industries. Autodesk Buzzsaw lets multiple, dispersed project participants -- including building owners, developers, architects, construction teams, and facility managers -- share and manage data throughout the life of a project, according to Autodesk officials.", "targets": "Business"} -{"text": "What label best describes this news article?\nU.K.'s NHS taps Gartner to help plan \\$9B IT overhaul LONDON -- The U.K.'s National Health Service (NHS) has tapped IT researcher Gartner Inc. to provide market intelligence services as the health organization forges ahead with a mammoth, 5 billion (\\$9.2 billion) project to upgrade its information technology infrastructure.<|endoftext|>Business", "inputs": "What label best describes this news article?\nU.K.'s NHS taps Gartner to help plan \\$9B IT overhaul LONDON -- The U.K.'s National Health Service (NHS) has tapped IT researcher Gartner Inc. 
to provide market intelligence services as the health organization forges ahead with a mammoth, 5 billion (\\$9.2 billion) project to upgrade its information technology infrastructure.", "targets": "Business"} -{"text": "What label best describes this news article?\nPlay Boys: Google IPO a Go Anyway Even though Google's two founders gave an interview to Playboy magazine in the midst of its IPO filing, the SEC allowed the company's offering to go ahead. The boys filed the interview with the SEC and corrected mistakes in it.<|endoftext|>Business", "inputs": "What label best describes this news article?\nPlay Boys: Google IPO a Go Anyway Even though Google's two founders gave an interview to Playboy magazine in the midst of its IPO filing, the SEC allowed the company's offering to go ahead. The boys filed the interview with the SEC and corrected mistakes in it.", "targets": "Business"} -{"text": "What label best describes this news article?\nMore Big Boobs in Playboy An interview with Google's co-founders due out in the current issue of Playboy may delay the company's IPO. Securities regulations restrict what executives can say while preparing to sell stock for the first time.<|endoftext|>Business", "inputs": "What label best describes this news article?\nMore Big Boobs in Playboy An interview with Google's co-founders due out in the current issue of Playboy may delay the company's IPO. Securities regulations restrict what executives can say while preparing to sell stock for the first time.", "targets": "Business"} -{"text": "What label best describes this news article?\nDutch Firm Beats Apple to Punch A music retailer from the Netherlands beats Apple by launching a download service in Europe's latest market battleground. Also: Movie industry wrests agreement from defunct company.... Microsoft challenges Photoshop hellip;. and more.<|endoftext|>Business", "inputs": "What label best describes this news article?\nDutch Firm Beats Apple to Punch A music retailer from the Netherlands beats Apple by launching a download service in Europe's latest market battleground. Also: Movie industry wrests agreement from defunct company.... Microsoft challenges Photoshop hellip;. and more.", "targets": "Business"} -{"text": "What label best describes this news article?\nHP to Buy Synstar Hewlett-Packard will pay \\$297 million for the British company. Also: TiVo goes all out to attract customers hellip;. Sprint offers service guarantees for business wireless subscribers hellip;. and more.<|endoftext|>Business", "inputs": "What label best describes this news article?\nHP to Buy Synstar Hewlett-Packard will pay \\$297 million for the British company. Also: TiVo goes all out to attract customers hellip;. Sprint offers service guarantees for business wireless subscribers hellip;. and more.", "targets": "Business"} -{"text": "What label best describes this news article?\nA Personal Operator From Verizon Verizon plans to offer a service that would act as a virtual switchboard operator, letting customers stay in touch at all times. The program would send phone calls, voicemails and e-mails wherever customers designate. By Elisa Batista.<|endoftext|>Business", "inputs": "What label best describes this news article?\nA Personal Operator From Verizon Verizon plans to offer a service that would act as a virtual switchboard operator, letting customers stay in touch at all times. The program would send phone calls, voicemails and e-mails wherever customers designate. 
By Elisa Batista.", "targets": "Business"} -{"text": "What label best describes this news article?\nPaid Search Growth May Slow A new Internet advertising forecast shows a slowdown in paid search listings in the next five years. Will the projection affect Google's prospects when it goes public?<|endoftext|>Business", "inputs": "What label best describes this news article?\nPaid Search Growth May Slow A new Internet advertising forecast shows a slowdown in paid search listings in the next five years. Will the projection affect Google's prospects when it goes public?", "targets": "Business"} -{"text": "What label best describes this news article?\nFark Sells Out. France Surrenders Blogs are the hottest thing on the Net, but are they messing with traditional publishing principles? One of the most popular, Fark.com, is allegedly selling links. Is it the wave of the future? By Daniel Terdiman.<|endoftext|>Business", "inputs": "What label best describes this news article?\nFark Sells Out. France Surrenders Blogs are the hottest thing on the Net, but are they messing with traditional publishing principles? One of the most popular, Fark.com, is allegedly selling links. Is it the wave of the future? By Daniel Terdiman.", "targets": "Business"} -{"text": "What label best describes this news article?\n'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absenteeism a little high\\on Tuesday among the guys at the office? EA Sports would like\\to think it was because \"Madden NFL 2005\" came out that day,\\and some fans of the football simulation are rabid enough to\\take a sick day to play it.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\n'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absenteeism a little high\\on Tuesday among the guys at the office? EA Sports would like\\to think it was because \"Madden NFL 2005\" came out that day,\\and some fans of the football simulation are rabid enough to\\take a sick day to play it.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nGroup to Propose New High-Speed Wireless Format (Reuters) Reuters - A group of technology companies\\including Texas Instruments Inc. (TXN.N), STMicroelectronics\\(STM.PA) and Broadcom Corp. (BRCM.O), on Thursday said they\\will propose a new wireless networking standard up to 10 times\\the speed of the current generation.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nGroup to Propose New High-Speed Wireless Format (Reuters) Reuters - A group of technology companies\\including Texas Instruments Inc. (TXN.N), STMicroelectronics\\(STM.PA) and Broadcom Corp. 
(BRCM.O), on Thursday said they\\will propose a new wireless networking standard up to 10 times\\the speed of the current generation.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nAOL to Sell Cheap PCs to Minorities and Seniors (Reuters) Reuters - America Online on Thursday said it\\plans to sell a low-priced PC targeting low-income and minority\\households who agree to sign up for a year of dialup Internet\\service.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nAOL to Sell Cheap PCs to Minorities and Seniors (Reuters) Reuters - America Online on Thursday said it\\plans to sell a low-priced PC targeting low-income and minority\\households who agree to sign up for a year of dialup Internet\\service.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nCompanies Approve New High-Capacity Disc Format (Reuters) Reuters - A group of consumer electronics\\makers said on Wednesday they approved the format for a new\\generation of discs that can store five times the data of DVDs\\at the same cost -- enough to put a full season of \"The\\Sopranos\" on one disc.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nCompanies Approve New High-Capacity Disc Format (Reuters) Reuters - A group of consumer electronics\\makers said on Wednesday they approved the format for a new\\generation of discs that can store five times the data of DVDs\\at the same cost -- enough to put a full season of \"The\\Sopranos\" on one disc.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nMissing June Deals Slow to Return for Software Cos. (Reuters) Reuters - The mystery of what went wrong for the\\software industry in late June when sales stalled at more than\\20 brand-name companies is not even close to being solved\\although the third quarter is nearly halfway over.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nMissing June Deals Slow to Return for Software Cos. 
(Reuters) Reuters - The mystery of what went wrong for the\\software industry in late June when sales stalled at more than\\20 brand-name companies is not even close to being solved\\although the third quarter is nearly halfway over.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nHacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nHacker Cracks Apple's Streaming Technology (AP) AP - The Norwegian hacker famed for developing DVD encryption-cracking software has apparently struck again #151; this time breaking the locks on Apple Computer Inc.'s wireless music streaming technology.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nEuropean Download Services Go Mobile (Reuters) Reuters - The ability to download complete\\tracks directly over cell-phone networks to mobile phones is\\becoming a reality in Europe.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nEuropean Download Services Go Mobile (Reuters) Reuters - The ability to download complete\\tracks directly over cell-phone networks to mobile phones is\\becoming a reality in Europe.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nOpen Source Apps Developer SugarCRM Releases Sugar.Sales 1.1 (TechWeb) TechWeb - News - August 13, 2004<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nOpen Source Apps Developer SugarCRM Releases Sugar.Sales 1.1 (TechWeb) TechWeb - News - August 13, 2004", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nOracle Sales Data Seen Being Released (Reuters) Reuters - Oracle Corp. sales documents\\detailing highly confidential information, such as which\\companies receive discounts on Oracle's business software\\products and the size of the discounts, are likely to be made\\public, a federal judge said on Friday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nOracle Sales Data Seen Being Released (Reuters) Reuters - Oracle Corp. sales documents\\detailing highly confidential information, such as which\\companies receive discounts on Oracle's business software\\products and the size of the discounts, are likely to be made\\public, a federal judge said on Friday.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nSun's Looking Glass Provides 3D View (PC World) PC World - Developers get early code for new operating system 'skin' still being crafted.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nSun's Looking Glass Provides 3D View (PC World) PC World - Developers get early code for new operating system 'skin' still being crafted.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nApple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. 
will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nApple to open second Japanese retail store this month (MacCentral) MacCentral - Apple Computer Inc. will open its second Japanese retail store later this month in the western Japanese city of Osaka, it said Thursday.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nCharley's Force Took Experts by Surprise (AP) AP - Hurricane Charley's 145-mph force took forecasters by surprise and showed just how shaky a science it still is to predict a storm's intensity #151; even with all the latest satellite and radar technology.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nCharley's Force Took Experts by Surprise (AP) AP - Hurricane Charley's 145-mph force took forecasters by surprise and showed just how shaky a science it still is to predict a storm's intensity #151; even with all the latest satellite and radar technology.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nScience, Politics Collide in Election Year (AP) AP - With more than 4,000 scientists, including 48 Nobel Prize winners, having signed a statement opposing the Bush administration's use of scientific advice, this election year is seeing a new development in the uneasy relationship between science and politics.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nScience, Politics Collide in Election Year (AP) AP - With more than 4,000 scientists, including 48 Nobel Prize winners, having signed a statement opposing the Bush administration's use of scientific advice, this election year is seeing a new development in the uneasy relationship between science and politics.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nBuilding Dedicated to Columbia Astronauts (AP) AP - A former dormitory converted to classrooms at the Pensacola Naval Air Station was dedicated Friday to two Columbia astronauts who were among the seven who died in the shuttle disaster Feb. 1, 2003.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nBuilding Dedicated to Columbia Astronauts (AP) AP - A former dormitory converted to classrooms at the Pensacola Naval Air Station was dedicated Friday to two Columbia astronauts who were among the seven who died in the shuttle disaster Feb. 1, 2003.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nRussian Cargo Craft Docks at Space Station (AP) AP - A Russian cargo ship docked with the international space station Saturday, bringing food, water, fuel and other items to the two-man Russian-American crew, a space official said.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nRussian Cargo Craft Docks at Space Station (AP) AP - A Russian cargo ship docked with the international space station Saturday, bringing food, water, fuel and other items to the two-man Russian-American crew, a space official said.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nBangkok's Canals Losing to Urban Sprawl (AP) AP - Along the banks of the canal, women in rowboats grill fish and sell fresh bananas. 
Families eat on floating pavilions, rocked gently by waves from passing boats.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nBangkok's Canals Losing to Urban Sprawl (AP) AP - Along the banks of the canal, women in rowboats grill fish and sell fresh bananas. Families eat on floating pavilions, rocked gently by waves from passing boats.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nT. Rex Had Teen Growth Spurt, Scientists Say (Reuters) Reuters - Tyrannosaurus Rex grew incredibly fast\\during a teenaged growth spurt that saw the dinosaur expand its\\bulk by six times, but the fearsome beasts \"lived fast and died\\young,\" researchers said on Wednesday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nT. Rex Had Teen Growth Spurt, Scientists Say (Reuters) Reuters - Tyrannosaurus Rex grew incredibly fast\\during a teenaged growth spurt that saw the dinosaur expand its\\bulk by six times, but the fearsome beasts \"lived fast and died\\young,\" researchers said on Wednesday.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nGene Blocker Turns Monkeys Into Workaholics - Study (Reuters) Reuters - Procrastinating monkeys were turned\\into workaholics using a gene treatment to block a key brain\\compound, U.S. researchers reported on Wednesday.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nGene Blocker Turns Monkeys Into Workaholics - Study (Reuters) Reuters - Procrastinating monkeys were turned\\into workaholics using a gene treatment to block a key brain\\compound, U.S. researchers reported on Wednesday.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nDolphins Too Have Born Socialites (Reuters) Reuters - Some people are born to be the life and\\soul of the party -- and so it seems are some dolphins.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nDolphins Too Have Born Socialites (Reuters) Reuters - Some people are born to be the life and\\soul of the party -- and so it seems are some dolphins.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nWhat's in a Name? Well, Matt Is Sexier Than Paul (Reuters) Reuters - As Shakespeare said, a rose by any other\\name would smell as sweet. Right?<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nWhat's in a Name? Well, Matt Is Sexier Than Paul (Reuters) Reuters - As Shakespeare said, a rose by any other\\name would smell as sweet. 
Right?", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nUK Scientists Allowed to Clone Human Embryos (Reuters) Reuters - British scientists said on Wednesday\\they had received permission to clone human embryos for medical\\research, in what they believe to be the first such license to\\be granted in Europe.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nUK Scientists Allowed to Clone Human Embryos (Reuters) Reuters - British scientists said on Wednesday\\they had received permission to clone human embryos for medical\\research, in what they believe to be the first such license to\\be granted in Europe.", "targets": "Science and technology"} -{"text": "What label best describes this news article?\nRussian Alien Spaceship Claims Raise Eyebrows, Skepticism (SPACE.com) SPACE.com - An expedition of Russian researchers claims to have found evidence that an \\ alien spaceship had something to do with a huge explosion over Siberia in 1908. \\ Experts in asteroids and comets have long said the massive blast was caused \\ by a space rock.<|endoftext|>Science and technology", "inputs": "What label best describes this news article?\nRussian Alien Spaceship Claims Raise Eyebrows, Skepticism (SPACE.com) SPACE.com - An expedition of Russian researchers claims to have found evidence that an \\ alien spaceship had something to do with a huge explosion over Siberia in 1908. \\ Experts in asteroids and comets have long said the massive blast was caused \\ by a space rock.", "targets": "Science and technology"} diff --git a/tests/data/t0/ag_news_prompt_inputs_document.bin b/tests/data/t0/ag_news_prompt_inputs_document.bin deleted file mode 100644 index b786d6e414d6c5784520a20da2f254eb9c3e7add..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12038 zcmd6Nd3=m#*Z*~{dzs~)d-j=RlF3Xm*^x~WqLWMrNsT0yh}aSlNr;pXJ3&xEP*klg znxd9c<)N*mDAigXEhV}T+P<}vwogl|8|6J$^7hsD_e_4D&->r|htJ5&eP8EX=Q_*x ze9t8-{rvCoK_NR~ySPH_ARmy~{gqTE8ELB~g`1;#ir`k4Ls5c~d>3q{10yx0zkL)) zWU1+loCh!Oc9V3S19o=OkbPV(8Ocec4gw9|>+a(ky-&#hw)(+{_7DzN#_`2R;KYE- z+n%ibvpd*ySGQzwIK-PI;wGz%jR4>d@j|6ImeY_qoz38a2T^YFl(yH~SItfkVUJcP zNu&{8PS6|OWEK}OE*)Our&9<1Hc8=IgQ}e_Nb9~lQb~W)h-BHsCmPHY4v7g*?KkYR zEbi_mTpJM%YhAcfL$(*xadj|FvV)u4nq5srIHaGX2oMBeA}_+C04C2(K2M)dFF-Fi zu-mufn^SddmvI-`y(etYZhJYPV_uE_5i$J3}v8+~oIYwM`;6_)$71nrJ#?>U>MB+PUA5cEk=>K|`KZZ5Abx zT3k<0JZA^3a9CRo(cP4!(PgH|@E-II{+jzy+Ha6Z!JuA32ZIaV=Ozl<#!F<40-&B= zmQFmE9Wt1IPEo~)`2Jqw*g)S>f9HSGtT5MUBr=)%u;&8*RWL_I2e9hc)HGuyBStjKWVgFN-Ni0$N^t{a#=DLsao$&35TYlxRR#K7;vYD1Pu(SJ6lLwpq z%f(3weR7C-iKYSmlB_2(v5Id>%-g2Rf?)seU;Ib~O8Z@RxV=O~c&^VGtKMv)iCW+G zu)kWXs^>n=tfu3l%VDZCU0AB!ZFqt3mOrAvEOJJVrf&9oV7%^+!eYs}l3%<2T-7``yhhd7sgcix~ z4q%^VdSAIO8Q{e*8LcT3CpA3D? 
zu`~BESR!Qf5{cA6kxn92Fud&m{u|RolvlYg2EG@zKm|Roh^FKD(WtMltn%+UtBEX3 z_+xa7$N9=zO$7A+o4ob^W{v;3{@*i46LDdF`1j1wMD9C%XO1Q^Soi>` zd#~;dPzf^eBR7R6@>Ay1w16*%E@a6>C2>H)QxdtZ%R@PzJ598SWa3$jwq?P7!x`#T zKJfasygOAxsx`GzxN}leo#usM8WKm|q5sNkBFErbl zhmPa99W%c!%@R9B?4Ix$+6drb)E4*@ZPp&+!jK-<)wdp@WM`r}D;#TEI$TJc9sbPR z|LGsKu+2qacH7h4nVcqaoE8W(4FH?Pzt(QjIAbw4sjYGo8EFjbnvULoSeplDgIuth z&QsfA2EMmc(-rq>RfsTAcRI+oE(zol?TdOF^ssh>4A&iQAWX-9x9IPaB1_w8KrFWy=qqDG^HPUaH})LlB># z_6Vc#L?fIJVUeR4EUoy^REgRChVWAlX8&5OYV7Yu@OZa0`^RQ+k<1Z-x!#X#2xGNr zojfthKWl+XBHb0tgQm+HExz}5e$X*$8%B$5m5oc^=BouP?P!adZr!EXVc z9r+e!%s!Sa5Tn zSA;YhU?p%ul%ESO`rk14UO`;*yDJ^O^RMl6M0H64aWiyvP`BO=k5QtTG~h~~speU( z={Z>LSlgKf(PGIl(UYB;9Z?8O*4j%<=7ymVBDdz8@*dw^QS9zR< zbnUFtkS@i?A|-MOec-Ncc7X_MRc75+P=i(A5n8HisPA>#k-m}^UZmgezN@=wtJDot zFk~eseZRX`dzQ_H>rO;h1v(&L-)rcdg7CCKt9`4R3oa>ap2}9RYn`GS%x(WL{=VN9 zaF!0pz2p$_lSNIf(+=|lFZ`)EP6axesuc`%^m1;2e_@6y*xNOTU*m+Ct`~EBYdfX$ z4#Qnthc|xltA$zOA(aS2O*;%~;|(P{vG_L2tsbz$9>r=}Fvbg;w1Zr}*O|Fn5AV%! zVz;q8E~a#Vv3N8)p}es7=G?M+VW0uK2hA5^i7P-+V@3_ZWwFEZxQ|gLvkA*^ZDR`&9WV?8EoLz48^Ks#7h2CCO`q^;UA@C=&_T~4lK`) z2v@(pzm>SkUPd%qZSP6UWAvEQ(xcK~oA8{(BI@mNd2lZ?YK#i}QPO?=U(b}oX9sc+ zS)<XMKw*C0W|Ine4a%IrzsYxxD;HU#HU{5 zwu|-n8Liwd`rchIF%7)kBM+ucTNh~>c(3nP?IUo4Lo<(e!9n|HLLA2Pul78s(61Kv zM@ZzQz+1{KP(3Jz)F@rx4LC?$T03ZF;?qzydkl76E{Ib9j5U1@R^x9=R*Tn^#cncJ50l7ntd7_)GiQqmtRb^>`7(e$?JR^VAK`E1x;~zX zsP*SjZNQ`w-@)JDS(h zCfe+fjB)R^u=RAU#P@XTJ?<3tZN1hw;4Sqrt%5s?9r^$@=Ajsgl=wG5c>g7~%3kHL zGSL6oj?l?=87r{6Fn}yo$)vxK%GKZr#AGrrY(x)~wC6f9e=kFu)-_|j0qT!oTn5|8 z+88fP9LTbpKtr#JQ}CQdd!*(_>gT|24F1WpomA?sV$B^-N9(d6A0}el=3qDcDSG0MZr|}! z^u{c48yI1kVl6&3HPD85KMj5nvdQn68sgGi#VGZFT)j-OOVJ70jVfUR^!{<2jlplxj;tAB8h?aYpfebG7^o8J*(h zN`C1bR?nQUo*!y27cGc0!%A4y^u7P2(4qDgy#mOKCO_@Z(^=l$$@L)rcXSL!%e>?n z63D8zpwEL3W8Z}|SncHb{cRp)k43IPgk6f0$RJMR9#apP zDZ`Ctuv76Mw{roCVTa+Eb~G)uzYR)54=tr-iUDsi-l9t(=NydQJo^BX zSx-;uf{hORRwDQOT(nDJ9yGLFOdljWVX$p5qJ0P<#a53#si91j?*|q=t<%~B<88dd2=iHETgEW zygTrpPn`?A!_D|b*-v8k?t1HaFLT;(TdSKs@nQ$*)p##KDL6(*|hD*;}! 
z4oAO=3dw`rnO?qAln1-h5+TLRy0#$fj>pnPu`c|-m(N0MmA(=4c$9{eguEJ|#ePrwTKznHj_8XSd-*Co&ch8Z$Zs@CWNbRW<*iaNvrtoDr(x*C1G|2sd-D#EuM;aivS=>)X)!;AQ@Ka9UpneQa6n zSj9-Qm%>yhtA5Nsn#i`0vB5)Bok>uL2y?YnI(tw9OjICZYQU#yFS(jqpz0brnwB9# z$)+D-H^$y-S@lmE(af@5hd*gP=&;%MrOmvkXH*WzkpdGXGSu?E$wp#h)8W$zYl}Nz zEoDCYSeiwdMn)y8wR{qb^DKA=J;Ym42aVC4c_)nLb}D|#lrSz?*LIs{nD#7O;-XS8 zegje|tLlsVFTxV6)B?U;HV?oWy;696>=Z>h{+1~oRI^?;3+91#IA~gG<45`T%wK3d z$m;+MLabs+tVu`K)7lF$QMKW*q#-L6XY%5y@11SV*=&6vvn;b#^+nJ(e1<+yTbSJw zy`YDc{UYNp4Z=D@6S+`egvh`M^uXf^GseLNtZ^djM`ThX#bV_>5^BZS5Sy9sb~C*S zk%+}utgSO-K?HRO#gYSJ!I))5>-)0tK<&%I=RGQPf50NK7P8g33L8mvC{jmYK1m9= zp&G^2V>Qj@YGHQ3ApR%S1}GArw}hFnALIsicEHPc68=M*V)2AxlO6M!KNd%1$i_-%xQYM(h(`GIg~H=-HXXjre<9oJt+- z(2!*?XV^v9Vs=4qE^@-noXLCx_6oD(*IATWkuYoYk5M27`n=?_m@Kor8Gya?j(LOE z_}=SS=b^SQCx1fMqkgmXwJs}!npZ^U5vJ`CSNBl+bjMb z>vSIUHmmMj{Oy|cZX06g`r-9T9p49f=XdSLH27!zbmuT}Ah*c>3U7y%$Y-Qs@5z4I zhaD)u?HWF5iXu&%n}KtT%q-0ReR)lqO>|MVhTnEPJN@JdF6{oULOQ;6o?E24>@$mf z(mvJU-Tbk0zr7ag=>~bh@l_VK<9mFnBR6|jQ6ns+b#bYf-LEv)i^41xIbQU~Vkn_C zspU(Zkdm>K?!-x3+`29=hDD4C#hSFh1!cl!bt-mODH+TbgJC!FDotcox=gYKBi2=% z2BCkSlw1%bQ2HCSQqoHGla+PN8qG3#Aq}~(G`oW^)?*{Brwxb}j z@bL0MAfXKw7;$Q#8AAIanre~Zf?|E<8zS<#(Oe}z3MwgPJpbqTA9xlcuPQ_l5(dz{rx2}-4HP3 z9-<)^EM7dDXMO1_+Ef~?OtYvX9f%JmmHh|WSE!KF;P;E$gt=kCbZ^d68j*kxQH1;fUcZcJ3Q^_EsKd(p{=;tsuemofGvv$hfRRaaH^nGy8uN z*$IMh(Y^~l6KtUJUx9UJHN6$fo*1l*cCv{00a4;VLanig3Y*`lPkUo~uIn0Na92M3meUC%r$I79yYOf&&3dV@iYpY8N;I3WQ4SUol>YIfU`vE$eAo zLCHxtFKI*A#6|J+*YlRc9{7>fTz-zgIum~qLdM(8-c{gtphT!`p0Kn$9j8jxoi>(3 z4DmQ|E)8Gz$5Tmad7s(P4rR(oik=3SnlZ_bh3z^oEY@7svC8+|PKnH+Z%XAji)S)$ zVng190e zX3tqvCwrKkb2qJsW7Y3=be0Cu8RE_wKRfbMmoZz^LY1u_)=$Q~vnMcqvsXttF;n@L z9%oK8?ABaEJac7-oo8<`*$)r{xWh)l0p+h4u_Ejh{P2XY?6^gH;o3}M?k1TW8u+Vi zseUf)5`Pw!aF@8cXgd^X59wKSeMo0vj`me)laJ4@F|IGe+HiVE8uVAY^4X1p+Jsj! 
z%k&LU-T~k%{a~dNen1R0U7n=4y8!Qm7;J>|=%l?-c_hfN1->|nI`H8Zx9zLyCiKSM zenB(yvp&I#0Z4L4=Gf<;iSS2LTU0R}1BFe~FKNh|bUCq2z^6=v+ zr}gqInH*7T)u%%D*uIVTz5h6}QJmYhyvJ#=sz(~uwGoD~A^B~ur~twjIiW&+T~d!^ zCmkTLxr0mq`JjfLv(?NSZX{MBIvp*&!29l`!pSOyBK)C0-@Oj#@hFUUH`&u0r;@e;#3?7yZoV}=nrH_DUn`&DMC@*0 zU`%mXk0)9(R@?zsrR8e8krO;rw`T{-K;@XgeIJWYr$eIFgw@YYhDEQ>Vl$=N@h5sL)G&;k#mx}i41o-S=jw#XqPEw zV3$;Y)}FQ)ZH*Q<&qd-anw{hY^pg~!0IOvcoW~utCo3RKy%B(kJw)K)YP+L1e&ld$985b1&e%BS#kzI^$%_1E(ZsPIW68FOvKa?XUaEP=<`x zgl&PLPi`L7!jgi4gLhPU!R2=*{t7a$7yLR{&LUr0Vuhglcmu;vxIZKSYy2$w04JKs z+38TG5A4u#kKuNFe|2=btF2UDZNWED{{DA_=i>gpXLgI=D}9o}P5#*XuBMvKP;1B| zqv)_hM{`uT2d!`}tRr@eRXUdUef!_tb}u?TqwGZs`}{Mk9Z5X0u#Zg6yv}A(dbID* z%=`{YvN1frgPHD^%yfcu3-Rqxi!V>Udw5URoqpoH6Kzw*KGE`WcNRAl`cjjdtacBG zIj4KW9gS-+bmzqLD46#S5IMgr9WHmz!`y#Xi}wM?vySr~cIP7Gm$4g&g?LJdu%2_1 zYGexl)~nw!F)wUwCc4;@2PAD~-0io4hLT5kN6KLxqg0Zq`Y}o~t*Bc>$0+9Ew7#C+ zHK*{$=-a(S@&%XAjm+|GrSCj!c6Xc~YTE3dp=G>wIb?TybFz}~`Zp(=Y0Ejv|PoY zIe4-9Epef^V0Z)c$C>+9?NXe^C>8>o)aJt%9Gi81s|WZQS(Zz2e|a9y6=UsV_ukI5 zb-{$B*1WCPNQo@(Hq3DYQOcr@3;dtx5UgZTRES-ghg*vCkSbxF@!Q^hp2s*VdRtt8 zH(>gz+r*17-_jHBx=1($v|7Kx=XN5OGMswwCMnAmyHw03`SU+hKjKbeA9NA#&H3`E zVddu*EXV9v6xN|MKJ$qdX~-+T*uCzB-`gP$tYtMOe{~PYvF7!es2vkF5eC?C#*Rp@ zDC}azLkOj;7HPGZ6SYh($(s^ZppnT?WH$C&eM>G(FK?KgRsp|hYM_^KOxWLLXUJyH z9bGVdDi`Bw8#G@ka}Dfn)lokcGQrakP0vM6p*x*{eSnBsXh=Ahfmu}!DnLsBe5qvj zJAKPNzk{EV2g}4tZWX+X(e3T@l#rA7M7pA@2=Du|xOj7%L=Fm0m}0%N_rnQx^noJn zNaMPNGPzXttO0LW#ectN5In#=Dwn0X8=zmr76`PNy7-oxh!yQ{=GwOV`Pe7*5+us!r%vV?1{ui{cvK zuyO|4;R@>c&ED7gRl)Y++t~9i#@Mc37)@^&TPlceU5>1M6s~}mrZ8qMs^{~Tueg_U z%=B3N6K$%eC0nQYc`HI`K5BES*9OX-`4`re#E1jvFXs_|ol(T2v`dz$=&OI-$+}@| zuVj_Wx7<*&sdH6F5`5bGjAhT??nqx!uf$t2F6@!n>~|kBNN$ox!kX*)2xWC zakezM>pIb~&I>=`ZQZ+b83Q{Pk<8o8d6xAV6Sy4ge%vHW7#h@i^(ag~)9sT(!MuK1 zV!bE-hS)7cB7DwiI2>CJ=kz^e81ht-y64%ancDdfoLXkcH;jETHn!sJn+#PJ-Z))^ zcjS^>pKgetClTGTnGL&PrfpYE+irBiSLD&L`6ivry>ZrB0ToR1zE)vG!$wqh_SFV#1La)VXTa?rm&ze zR1}8V%2-ekVyG+(F%-ta`R{x$F;4RHotb+%@7#O-JEx|qA?;4$Y# z*E!c2)Z)D0s&_qbop7CZb-GkfyBb|(sMeWsmAXn?M_l_|M_t7()iU>Lta>3e{a=Nv z$aUP6`Q0ub54p5nt)tS_=DO;ta%m2++_l%G-^)Et^-KTea7U{ainosN1 zHO-lcn+)dQKvoRCz{X{fe}g^CBcFwnD>t0JF(6{{nkhPm01KZZYH`ZPdr%cUZA5>M!6NtX?1Wmv8~5Hbi|4&coJ?QGWxM;OM5P|Ay~3M}B`xFts%} z3G=s+!@lj2&%lBmk-vhAuq-F)kKh!{+Zpwza2i@})SKZr%-t3BMffZ)@{!%aKX7bM zA9-qD@GWdCh}}ZDu&-fP!=8p64f`2(Gi+wq%&?hZFT+-boecXJcCr8LVc5X1fwBF@_9s0} zkIS|j+ieBB1Gd-LR%1JpzNcH*#-t}IXWNpVq@3+Zdb4u2Bk9x1dF@GmQm!7QP>)vV zOi+{u>?BmbQ&a}*6s&^Pum+xn>YWPpM1}gHLg$V`=Ypa(`5R}JhIQoiFbx}EBW!}r zum!flHrNh3U?;o;)$0{k0?C=7VHf!|*bRGNFYJT;Q0I=~dcX$Z4VXCxZjs-Hci>$( H1c%{YQkiNZ diff --git a/tests/data/t0/ag_news_prompt_text_document.bin b/tests/data/t0/ag_news_prompt_text_document.bin deleted file mode 100644 index 60646247e5037a6b277473adb47a2864b90408dd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12526 zcmd6Nd3=m#*Z*~{dzs~)d-j=RlF3Xm*^x~WqLWMrNsT0yh}aSlNr;pXJ3&xEP*jy_ z(G<0mDi3WfMXA>EXerT!(DtpRbfwje@;g`ZdiDK1&*b_1{`>vmGct4E*E!d@&hkCq zbID3S|2uq8$adH&u24J32V{1CC6!4=+R91c=BS<`xYgxQl%OQv1)J%>NDb+49|aOw zYC0w7!OJ_{Bwgo#?VU7a50^_uauTV7K*RUCySPU0W3LbGeN6rr3Hu}3LpWR-#}^-h zV*@U3dA$BF?qJhx-IB%O5O0!*o2)c80)RWj3zgznPDAE&HiHZ9MY+in+Foy8HakIt zU0R(akw$nqL2q=ESzN@pba;)QP96B$B!zD+sV6a*O;p=A;i{tqF2)Ia zg1hOCQa9OFyIBixu0v7YDSF=GCVxb$Z4$A;PtqCDMAIQt=UZ~s_PvI*Lw2|f8uGMi zqbQNo;(B`Q89Qi&gW7V4?xrM-E;CJr_n>$1*W6FiUV}sm2K5p;7+ml^H&NI!ULr3j z0P5*Q>DV*bA%pp66jhvv@9#E_4fHMf5B_(}3Ui%CB9plfdoJ)_33J59duHcQXwVuL zsM6tCjhiS9`@@e4Pu6*H_rqx^?rpE(^5R^viTDZW>HzYV>?WN_PN+oI#!Te^YIP0S 
zn;=zN?{Sk*@;03<%cLA4?0-ozi6u&(-ng;CT=!AF6TTgB!>=4xOG>g%Hq+7uc9tJ% z@?fKXxj0FoPYy9J(KNs#>3Sj)tN5nGJaD=!2=@Q^#g9~=#6NTg+e<`*XZoD7>dhva zsP%0Z`>VC8dhX-QYC0~u9HvUsg{9iPMwu8mCD96x|1#5_6&3n|X5_UvxZ{89Q>m1QhePXmj#69$Dl}tWeZ|xO9|IR?hwb`Q`saonoQr;A zI;wkF?96=(mIxU=Mj|y(q?1S$3~xJt|Hd>C-=k8 zYa$C1{v6%nf&VVIH4)JNzvj08JFEO}8Gp|#O~i%y<6kpN6S?dB_nD=M4A%XpyizA^ ziT1**ckQreP_jltel2Wl(?n8|g1ZL&cdVJ|tanetIViNS zHJqYeM1CQBZFzJaDIqOS{v@gR zZ?th-fU=2n(VjKg!C$+vIX_Ee*1IPZO=JkL6)YRF$OqEG&Mg1!z}WZY2k*q>Zn&}d zHH{rUDeh-)BIDq@Y%O`F!(&gVh2#`39Fv;JY&ln*N6zS0rYzAv$Jh9c6;i1k{kw@6 zN4~_dJarV$@QC?!X_nY2V&{a<&{hEVqc*{(XuI|pD~9yAuD*2%CEF9#S>ad@)8Sm= z?C_`N{%?P&g)J@uv)i8R&SW)_qqIPnX#m(LK3cy?ZBc}B(I%D%R<-Z7!`J@V(F@^b4S7q@5aaM#ARS-e z9k4>@L5u#_T(NHpB78|(o{&-s#75X7PQcv0QPUXseYEej+v78hmJxmuIUmSSMZ31< zx(LAo4q=Ph3+@z_rCFs`ald2Ux&$E(Cmd5~$QM~GuWyD^q8+aDEUSL-REbbZcT){_ z6@vH-wMQ6@Cmi9t4~rbdU}?p#rb_HAt_eT)VCPYb)sX$&2p;dIW}n$8E|NJyur~OS zbz!WYt&t~2`DZOqNu;}idDV1zy~Xz)&kj05Z9@{_U6mK&gG6$!ozs7Iw^K9`O1S&F zHux>T(<9%)?Aph&31am6(D)#hUz>=exXkr!rT(ypXcKU0+r4BBd4;P5OVN8$VkV1# zJYZfgk;~fG5Pem(Tim^x+Mq#QjK1}1S0`NB_GT;f-jkBk?x6OApouz4`=aP_S1FRrUi1Ks6dH-t$->Zl({NYN6@BC{!9a3FTK->&n9n`J0!$Xv8CJngS zXR3LYD|$|rNE2!)f?4B$(Y8+mML6-A3zS0={d}*!KVq+8822W|o4=+OB1H$h6xg4y zr!F|k|HUmaEz#WbvK*f)E>pC_%EqwNN=qY{c;9Odd19n#bX~PDL-(0#q$v-WZr5$> z?*=@1v+lO_0}j2YS$`*f^GUEBrfZpnSVr)@?mIEbDfj(YezEh(DGr!#$Jjt*A;Jr~ z*QIEUlQJKRo#2I1s75<8FHTpl3zNtZwN7B~`CglLoNuo@kPaUOZ54l*v=noHQS2d< z;8h-_AzeGGG^9)Mkw}SLK;O8nn_VEnYL!{{71UrAd4N{y8tQxPcBHSQg%|1fJ8$c* z+bVSf6%4V-N#F16)t+U+;kskdRe=r&*z+2CuOK{W&}!f6=7I|fo2RlBd|apK26Nlr zkH71;37n+^axXYU{AAHp>$Jl>!3%#Wj#7b+rfLO4GQFIe;0MfV1-rWj@h>=Grt8HV z-&#-Uyv=Z1*Wry{{c2&Bct9n>P}4Sp+IUULPBFgCa;pdIuuHLu7L4)22JIl1@3m&` z)WdsooY>_okBccCU@RWZPAxC&y*W3mUKnV=jz#l@SmFv$)R<9Ia9QlIJnmzZ%k0DQ zq3^Y`mmLqc0}&ozp89&;ef#n7@q9jbV54k@D+U|5HAAt&84(@)oC#2XdHg478G7_2 zvFfX4`ub^Cms!!}O>$*djb5vB-OSTprxXj2fc?f0TAt|LB== z_-tPeB6;+uz@s$7a)ov@6U~NQPGI`^-pf>rB);YMbU>Cd-@DYlvii!yuC_j-OocMYX*70$z8*s2I8zr>da*9&Xl;*bP562bb8 zV(B4#7Q5W^6>RU2q&677D`g0Fb?1k$Txx`Dlqpcll-;dl5SEi|q|(m`N9c~I?ee?2 zBh;-L&|4%=U|ssfRY6&Ni_(0%1EZD6WO1IVWk@6soc_rv8gf3U1+~Z|N;pOL^S_~I zCE`;ra$CiE{ESv^6@BkAn3x9M?2!l4rmcxI4ZPELv-SZv!J(PQyI{ZlGa(LR|2KOc zROnZUdm|+BQs52cCa4~iLu!;R@CNLsF0CCjGx2Gtnmq=)J{Lr(e?hD;2P^!yC9A|M z%3?U|H-s|38PG}{5zBb=+=W7Lmhg*f-93|x8zJ3aF zKvM47b~3qUbi&jWiQI?HkRYFs0J5zIQZ^~)nQQO~7j&n~)h>FNb~3EP+$?do97@z< z3e5u#B7z|_>%ZkO93sL9Lmm{y(pGY5h=)mKxLHSRnANjI1=gWix_lWxpLQC;l@IW@ za$O(KMAZEAsJ3EYP$lgP_voIH;g(G966V;x5Z5gqPbzM-6Z-RaR4y16Fco{rzObJ@ z&$Yt}v6=4BOy<9)lQWC4YqINI5GxLVFS*}fRZ>Yc(;DEQ(N3;s=G%^MQ)R?bgBvRtv0O9?Y z*eZLK!}EduPj`e)wu@NJ-Gu>UsY)jOg;cHvPa-Chd1WJdq@+F5k;On6+P1D4PYzIj z4C6A`ZdS*5Vd6lRy(RPv(H>&lub+e+Mdy$#IH#A$#>gbniJV0a=5m^bL`!+(6upZz zDXN+85$)PiBtZDLmk=R&+u3XwIY)iuqV;v*|5m7RqF-M;~bVJljpB?lRWx@pQB<3-Vzi#&ZsK<)5Ng z{^a%@Q$=sg61RX6mMK=_lT!n2*kh!@uR=EYBU3|Mn#&ly9+0b-DRwA2A?s2l4CYge zOVQ$9xb9FNX15Ea%+RH@?V%e|~R&jmXAt`jIy+yYG@}kL4`^$8eu@(c-N)nCx(!H2Q$LK>`c^8DU553T+(%f%@($I5DX_;ccTZ~ibQph<6 zqnFR#$K=-27Y-Q zx>inln4)iI8>xl0ZRaIics^LkP`VTTKz4SK2|HBF^YluD9qPGM`VdrzQkd1oxL+y9 zo=;1SHo|(#kN6_3+T{TA2H!h-f5D#d+p|x?fA!1hSoo9&6t5hmao7zciT{w82Os6l ztzf>4qMq{3z<)k*Ch!(F*Hft}6JP0NrK6O?&{Bu#X&4A5&WeoGEODOT@X0I$ci0Kf za$O1;vde>iAg9N2LmsSxu}@xxYFcFfnv;o5br~G^eHP>@KS4YD-cy|-I`<;3 zEYx+)jb;8iL*XE0nEl3CY-AvuhA3nd7`y2OE7v>Y59&*!vv7hdT{>UihSmo!^Us5m z`a0-i%WB7}PMW0#R54K23Yc)!YJA z*U-_l3=vN@{SZ4s_EyWPf6<6$#`r4yMe{+2WP~q`=0!cFazKt0m?)8B!z%dm+ZEHawIxC{NESiuj4T5%S} zW<$K)OfN$u;yV_LYYbTsK^;P|MjVUbu1+3H+{jifph zsUtA=Bn4bkjpFLDDra-GFgsun|Fdcx6p7DT!c5qYa)UcN;AK1!|DjE>cv3hsJC%Tc 
z(Rer0%A>eJKVR;mfrf{OM9*2A@O$W0>?m$Wx}e(1PA>`HP;o0p>=9owb+rlT?U}@l z_Awc0q40a>DhT$$SI$BeUbzSd?0kFpKn$Q6Q%JyyT*oEVDct zfIa=Td4pf@z2C9U18rYU{)DbY9cSxnT~-J+uZYegOyfhY?xFVSSS{Qhm=fTEu^iKq zc~>dipV)xt>rxNqZ!PbG{s?VL_R?}IY1pfIeF4Lym77QbtB_l@0eU+|ur**^yhPIQ z#9WwZwx;;j>n!MPR^6HSn>B0QHpJ-l!|Rngz7O=y@7j-P@z46{&SBy}Zjt{b-VVG;-J zZjtJu&rJ78`&5T_*NJTS6bN4@9?RP-0U4ijj)u~#ie33xYS%@3bR-= zdC@F?A^Nh_68R@OCZHOuIQH018m><+@1nDw-tHX!1}m_gmA zmmm)lveM!KZ}Se)#73<1tfKYB?u~J>iiBc65fYFKEki!*@B)dXWVnE7Et4I7R?770 ziX8x}!_S)bV=heAFbO{^@9>SH7m_m-$UL}kb!Hvkr+iOw8FWRioG0M4KkH{taJ%un zHk{M{GE6Qn5rSob-Wnzf1LacKjq{Flspb1nx_rDHzFpYd%@DtNGG>XLMmyYgN<`Ud z6qMK=;9g4t4=x`B653&b5ho&=A+#@|vlbaHDAs4bAtEOo%~kTFpps%X^nZr`k!Nx9 zsv;!vo8GPI-RPY|ZRC`0y!{!&uZkk&_q64I4st=wm_HGILqBx-7g;-|Pe7h7DR;Wl z-(Mor4FN;$AhL47;>ELh)(gL)O{LMwG~1f?4N!-+V&Ah)dX~5=jCtn8L0Q1ou0!M! zhF!>bo;~p-T#YK!S?ljBxK^^+zW88L*?*vYg$lV9ey_Mim>U*MuEa8iq7n9mixAjh z+PL?1?C)UXvj*g;kVCke7a2Af`M%j54jE1(Yp{-IZ{=a8-J#mj3c6g|IZ@w(jLRw% zSH_Pqv;Q}d?H~x}?K|Kz!3HY-6s)UZ}Xx#ZoMBqJf+Wn(xA@aa3*cY%grbHN^c7ZdXK&aF{iv2>& zFO0))Sx?gn0#Cx3P#eOgN{T1Hp0^x!!B4D4^K%5&0r`^fU6shIu zI90Olq_G@gh{uWZdHA|No={TD`^=VhC{s>S^fb8Cj0t}zY}I*TvF4(VRmtzROJojx zQ!2;12}}k~rO2B&=PpE!qie`28ZBIwt_Jw_y+7Mb>1HFxq}&Yvw}hO2`SMkc%?7T} z{RJZ>HV?2jPXbu9CwrK^b2qGrW0mq|be0AY9%9%UKRfbe7crC6LY1u_)><}2U{7FN zXRnHMVixnQL7X|!uv2pdanPl0cAmY-9%qU}(mJ)mb%_aU8ydE8fJPCh!n#<;c&>&VF=Y0zKo z%4at*Y7<_`EYmkYc?W>6^n;a7_!05fba|5E_5!>&W3UmcNLw z+_tZ(o6tLZ`vuVeJOLLn2EHZJj&}Gt=7{H*{V{9)6>`taM~;lRjvO>{`tmHyp-f(< zjE0S)F;6jmZ>sU6{_k=Y4r*fS3NuXzN)(`L?@DtMqaz&&>mW7|x7pjp{)z%pR(&FLm+jkl-*$*2>&3Zk%X^#@t9qni9UNg88)n1<`O zv>64vI^^LUCK2=_zZcT+b0kLoC8$H}I`!FN_}nl#{%J=`jT7N}`T`>ADX<35(tvo) zgDlNj|7*zWPDi#}1lAdea31@DL+Y81Yuc{FI>h&GMbeJUT)~*$J2fJ?7SY4)g!#Q) z6_4-8eCGOmjS;&Fham87#LOe)UN~b~j&>Iz4sUM_&)pN4fJ~GdXROmX#!G3qDqyQ> z&F4)(28zESnl8X9;3f-TB>fVruLytY&vvf^dNd02ft&2=jni0L0b-ovXg{WHuaU^S zN&oI9*WpAv82DQG1Sewm1OsD=!+Jc{mNEJcd~>-PZ!rZA)$Q8GGF3T@86SfY5JOVA zZ{-{9!^21QaT5jkSlJH0-Dgl@Y=lP`IL4%wLkPKMXWz%-)9H|?HDT3slVQ$=XShj|jool~C|`v){Y6-h zlOARXH#yYP0TENPu*1sGE>q0FPOJc}KW#DE9xZZ~i^TanJK=NaKPf^1R@*8#i#uyi zRzR3~Jpf4uBYnSl0_j`NbFY1?e(CwD9(D*1_QI*C_oLV;J-Q!Q3zwoD$bI=M@^J3X z?!u96I=mybTm7xNl2+)`C)8lx8;#n2!mlxC$N=qfwFglK&+WqMna@3kcefl}MCgoD zKo6XdoH^CaS<<&Yk>tl{f8Ec9GGyl_YzYi~e1oeNmJ|#eysgR$F27UpmykI==hwk< z8oAyQD+JxeTPuFT-5~*p7iQ6WIMq$gPKPpmV274FA2;LstE1apZm#6MrE3emiSqZq zB|H=NcN?%<7hma<6mIh8?sqlSbcR|(9vDT39Xgt$!aZn}GhrRETdmTu+R?h7-fs7z z(=*Cmw6M=V!`hL=Lks)J56#T)pd{bl)eoOi5k%Gk$RU+m7}twLXFa+6i=0WoKEZ@8mzErukWcwPnb-T@-#m!-qS?s-^C zPHXWF=6Kfm-a)27g#0pg^Rp07F%i~sZc>fx1;ASMJ0|9>Ei+=@c3kYq1Cll~?&jM- zL&*cYgXXY~Q7Xw){TQX0R@5z`V-)l7hC@BQZBF5j(6@Vu))JgrY+|!ldtfpRrJww!6JK-)s>m9p{7QRl!LD_*5lDF zKHvLT9bbm1quiVf9q7Zw4PZI&*dJSX#XQLFr6G}n7pvbA7m5ppH$Z>9tFT$S6sJ&% zg#gF3`S1nDX29R-0e(R?=tA5hPvyB{tgr08;ixQjQXnE#EUTB(i86zNjMd?TED^P zwj=K|oOm0e4gFvWUn_lFbg=p#kik;XL(WpbhHX#?JHi~oMtAh?Hn zRxV3(H$cCLO%P}^b!lCGB39CanXB9G?OWdC3E?m7c&-=zp8Plqz)IKtgPEuO-*euS z++^WKfE9j26c)KZ{Y#StNAR{nN!2m^D$EC6$fCFgIH;U~cDaQ5ezW_vepRrw_$K!0 zi!nFUFN~(wj4hR_b)62aeh{vJm!>ebF{5_sm4!Ek7vWvTB-f|wBIt2Mp=`FrZUAcA zRnxZXo$%FKnXo4wE_TA8EiQN#bo^$R#+i_f{L^?mb|T!=vELi;tuK|)&D_j4>8ghQb&NV~mA`r7_0J zSXlT65GyJRVu%H?FoqZl3k%OT=l2rhNxt{H=giEVxpTicucxP@xubc|EXyKt#~56P zN1RQrQ?8S!%X!+>;=1WN=4x>Dxm0g;wY$nuqw}z<%vI_-=-TZ%kG$2wfMTy3tqu5QMf z&)<)wTpH^>L#|<$p5=r~??r3OtW$I7oXak~2VE!X9qQf(T%9hhwc4fgbzJi`ARX6O zYu5YKG0mBYQ3l`OcvcMN;P9fz7vSXL$p664*^$>T34Vm>oXF?kjip=%2XiBT2dkGw z{v7^&z`8Y&=M)5I zU}|mT({OBEd z!QXIdd*tOif?wb>ncSnkFoC^+>0OZ*6a}BcJel6(0nFVK`6Kup-rF1XQhhKU^YG68 
z$afS6-@;Ov+v72ugSShf{t4Ep33yDyVSNl9voKXg4ky$gJQiTTK5&l@uuhHQpWrhv
z>~GlKu)Se-!{&y)4O<&FHf(Fy)c&)lVL!umhTROi88$QQW!THGlVKmjE`~h}TNrjQ
z>|ofyuz|7tNsrU=3wZ}@d(z`nVw;U^HMTYBgF4Q3COuL)+m`faZvy`(fN#9n^
zHY9ysx%!+!{Y#-9rl<(maaakfU^T3PwNQOlQ5Q&hu7>sG>aB`~
Date: Thu, 30 Jun 2022 13:06:29 +0200
Subject: [PATCH 169/297] WIP

---
 megatron/data/data_samplers.py                |  29 +-
 ...n_causal_mtf_dataset.py => mtf_dataset.py} | 348 ++++++++----------
 tests/test_dataloaders.py                     |  81 +++-
 tests/test_packing_dataloader.py              |  50 +--
 tools/preprocess_data_many_cores.py           |   3 +
 5 files changed, 252 insertions(+), 259 deletions(-)
 rename megatron/data/{non_causal_mtf_dataset.py => mtf_dataset.py} (51%)

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index af7a983c4..c3e305b2f 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -27,13 +27,21 @@ def pack_samples(items, max_seq_len=2049):
     """
     Items:
-        [{'input_tokens': array([ 6, 7, 8, 3]),
-        'target_tokens': array([4, 5])}, {'input_tokens'...
+        [
+            {
+                'input_tokens': array([6, 7]),
+                'target_tokens': array([8])
+            },
+            {
+                'input_tokens': array([3, 4]),
+                'target_tokens': array([5])
+            }
+        ]

     Output:
-        decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]
-        decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]
-        decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]
+        decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]]: Concatenation of tokens followed by padding tokens.
+        decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]: Segment ids determine original documents.
+        decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts targets.
     """

     decoder_target_tokens = [[]]
@@ -106,8 +114,8 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
             micro_batch_size=args.micro_batch_size,
             data_parallel_rank=mpu.get_data_parallel_rank(),
             data_parallel_size=mpu.get_data_parallel_world_size())
-    elif args.dataloader_type == 'packed':
-        batch_sampler = MegatronPackedRandomSampler(
+    elif args.dataloader_type == 'decoder_packed':
+        batch_sampler = MegatronDecoderPackedText2TextRandomSampler(
             sequence_length=args.seq_length + 1,
             dataset=dataset,
             total_samples=len(dataset),
@@ -236,9 +244,12 @@ def __iter__(self):
                 batch = []


-class MegatronPackedRandomSampler(object):
+class MegatronDecoderPackedText2TextRandomSampler(object):
     """
-    To be used with pack_samples collate_fn
+    Converts a two-stream dataset with `input_tokens` and `target_tokens` into batches whose samples are greedily
+    packed before being passed to the decoder model.
+
+    To be used with `pack_samples` as collate_fn
     """
     def __init__(self, sequence_length, dataset, total_samples, consumed_samples, micro_batch_size,
                  data_parallel_rank, data_parallel_size):
diff --git a/megatron/data/non_causal_mtf_dataset.py b/megatron/data/mtf_dataset.py
similarity index 51%
rename from megatron/data/non_causal_mtf_dataset.py
rename to megatron/data/mtf_dataset.py
index 129cf6816..8a4839bf2 100644
--- a/megatron/data/non_causal_mtf_dataset.py
+++ b/megatron/data/mtf_dataset.py
@@ -13,38 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
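The mtf_dataset.py hunk resumes after this aside. A quick way to sanity-check the pack_samples docstring above is to re-derive the three arrays with a toy packer. The sketch below is illustrative only: it is not the repo's pack_samples (which builds the arrays incrementally and must handle rows that overflow max_seq_len), and the pad id of 0 and the name toy_pack are assumptions taken from the docstring example.

    import numpy as np

    def toy_pack(items, max_seq_len=7, pad=0):
        # Greedily concatenate (input, target) pairs into a single packed row.
        tokens, segment_ids, is_input = [], [], []
        for seg, item in enumerate(items, start=1):
            inp, tgt = list(item["input_tokens"]), list(item["target_tokens"])
            tokens += inp + tgt
            segment_ids += [seg] * (len(inp) + len(tgt))  # which original document a token came from
            is_input += [1] * len(inp) + [0] * len(tgt)   # 1 = input (bidirectional), 0 = target (causal)
        padding = max_seq_len - len(tokens)
        return {
            "decoder_target_tokens": np.array([tokens + [pad] * padding]),
            "decoder_segment_ids": np.array([segment_ids + [0] * padding]),
            "decoder_causal_attention": np.array([is_input + [0] * padding]),
        }

    out = toy_pack([
        {"input_tokens": np.array([6, 7]), "target_tokens": np.array([8])},
        {"input_tokens": np.array([3, 4]), "target_tokens": np.array([5])},
    ])
    # decoder_target_tokens    -> [[6 7 8 3 4 5 0]]
    # decoder_segment_ids      -> [[1 1 1 2 2 2 0]]
    # decoder_causal_attention -> [[1 1 0 1 1 0 0]]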
-"""GPT Non-Causal Multitask Finetune style dataset.""" +"""Multitask Finetune style dataset.""" import os import time -import random import numpy as np import torch -from megatron import mpu, print_rank_0, get_tokenizer +from megatron import mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset -def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): +def build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seed, + skip_warmup +): """Build train, valid, and test datasets.""" # Single dataset. if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets(data_prefix[0], - data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup) + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets( + data_prefix=data_prefix[0], + data_impl=data_impl, + splits_string=splits_string, + train_valid_test_num_samples=train_valid_test_num_samples, + seed=seed, + skip_warmup=skip_warmup + ) # Blending dataset. else: - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) + output = get_datasets_weights_and_num_samples(data_prefix=data_prefix, train_valid_test_num_samples=train_valid_test_num_samples) prefixes, weights, datasets_train_valid_test_num_samples = output # Build individual datasets. @@ -53,9 +60,13 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, test_datasets = [] for i in range(len(prefixes)): train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - prefixes[i], data_impl, splits_string, - datasets_train_valid_test_num_samples[i], - seq_length, seed, skip_warmup) + data_prefix=prefixes[i], + data_impl=data_impl, + splits_string=splits_string, + train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], + seed=seed, + skip_warmup=skip_warmup + ) if train_ds: train_datasets.append(train_ds) if valid_ds: @@ -75,7 +86,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, train_valid_test): + seed, skip_warmup, train_valid_test): ''' Build a single dataset group corresponding to Option 2 of data loading see arguments.py a dataset group is passed in the following form @@ -88,12 +99,16 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, # Single dataset. if len(paths) == 1: - dataset = _build_single_datasets(paths[0], - splits[0], - data_impl, - train_valid_test_num_samples, - seq_length, seed, skip_warmup, - dataset_group_name, train_valid_test) + dataset = _build_single_datasets( + data_prefix=paths[0], + range_string=splits[0], + data_impl=data_impl, + train_valid_test_num_samples=train_valid_test_num_samples, + seed=seed, + skip_warmup=skip_warmup, + dataset_group_name=dataset_group_name, + train_valid_test=train_valid_test + ) return dataset # Blending dataset. 
else: @@ -111,32 +126,46 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, # Build individual datasets. datasets = [] for i in range(len(prefixes)): - ds = _build_single_datasets(prefixes[i], - splits[i], - data_impl, - datasets_train_valid_test_num_samples[i], - seq_length, - seed, skip_warmup, - dataset_group_name, train_valid_test) + ds = _build_single_datasets( + data_prefix=prefixes[i], + range_string=splits[i], + data_impl=data_impl, + train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], + seed=seed, + skip_warmup=skip_warmup, + dataset_group_name=dataset_group_name, + train_valid_test=train_valid_test + ) datasets.append(ds) all_datasets = BlendableDataset(datasets, weights) return all_datasets -def _build_single_datasets(data_prefix, range_string, data_impl, train_valid_test_num_samples, - seq_length, seed, skip_warmup, dataset_group_name, train_valid_test): +def _build_single_datasets( + data_prefix, + range_string, + data_impl, + train_valid_test_num_samples, + seed, + skip_warmup, + dataset_group_name, + train_valid_test +): """Build a single dataset""" assert train_valid_test in ["train","valid","test"] index = ["train","valid","test"].index(train_valid_test) - # Indexed dataset. - indexed_dataset = get_indexed_dataset_(data_prefix, - data_impl, - skip_warmup) + # Target indexed dataset. + target_indexed_dataset = get_indexed_dataset( + data_prefix=data_prefix, + is_input=False, + data_impl=data_impl, + skip_warmup=skip_warmup + ) - total_num_of_documents = indexed_dataset.sizes.shape[0] + total_num_of_documents = target_indexed_dataset.sizes.shape[0] # this corresponds to option2 for data loading on the form # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 # splits here is an array of size 2 [start_index, end_index] @@ -155,10 +184,15 @@ def build_dataset(name): if splits[1] > splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + dataset = MTFDataset( + name=name, + data_prefix=data_prefix, + data_impl=data_impl, + skip_warmup=skip_warmup, + documents=documents, + num_samples=train_valid_test_num_samples[index], + seed=seed + ) return dataset dataset = build_dataset(dataset_group_name) @@ -166,18 +200,20 @@ def build_dataset(name): return dataset -def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string, - train_valid_test_num_samples, - seq_length, seed, skip_warmup): +def _build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_valid_test_num_samples, + seed, + skip_warmup +): """Build train, valid, and test datasets.""" + # Target indexed dataset. + target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) - # Indexed dataset. - indexed_dataset = {} - for field in data_prefix: - indexed_dataset[field] = get_indexed_dataset_(data_prefix[field], data_impl, skip_warmup) - - total_num_of_documents = indexed_dataset[field].sizes.shape[0] + total_num_of_documents = target_indexed_dataset.sizes.shape[0] # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] splits = get_train_valid_test_split_(splits_string, total_num_of_documents) # Print stats about the splits. 
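To make the `splits` arrays concrete: get_train_valid_test_split_ maps the comma-separated weights onto document-index boundaries. The helper below is a simplified stand-in, not the real implementation (which also accepts '/'-separated weights, pads missing entries, and redistributes rounding drift differently); it is shown with the `949,50,1` split that the tests in this series use.

    def split_boundaries(splits_string, total_num_of_documents):
        weights = [float(s) for s in splits_string.split(",")]
        weights = [w / sum(weights) for w in weights]
        bounds = [0]
        for w in weights:
            bounds.append(bounds[-1] + int(round(w * total_num_of_documents)))
        bounds[-1] = total_num_of_documents  # absorb rounding drift in the last bucket
        return bounds

    print(split_boundaries("949,50,1", 1000))
    # [0, 949, 999, 1000] -> train docs [0, 949), valid [949, 999), test [999, 1000)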
@@ -197,10 +233,15 @@ def build_dataset(index, name): if splits[index + 1] > splits[index]: documents = np.arange(start=splits[index], stop=splits[index + 1], step=1, dtype=np.int32) - dataset = NonCausalMTFDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + dataset = MTFDataset( + name=name, + data_prefix=data_prefix, + data_impl=data_impl, + skip_warmup=skip_warmup, + documents=documents, + num_samples=train_valid_test_num_samples[index], + seed=seed + ) return dataset train_dataset = build_dataset(0, 'train') @@ -210,51 +251,42 @@ def build_dataset(index, name): return (train_dataset, valid_dataset, test_dataset) -def get_indexed_dataset_(path, data_impl, skip_warmup): - """Build indexed dataset.""" - print_rank_0(' > building dataset index ...') - start_time = time.time() - indexed_dataset = make_indexed_dataset(path, - data_impl, - skip_warmup) - print_rank_0(' > finished creating indexed dataset in {:4f} ' - 'seconds'.format(time.time() - start_time)) - print_rank_0(' number of documents: {}'.format( - indexed_dataset.sizes.shape[0])) - - return indexed_dataset - - -class NonCausalMTFDataset(torch.utils.data.Dataset): +class MTFDataset(torch.utils.data.Dataset): def __init__( self, name, data_prefix, + data_impl, + skip_warmup, documents, - indexed_dataset, num_samples, - seq_length, seed, impossible_token=-100, - ): + ): # Params to store. self.name = name - self.seq_length = seq_length self.impossible_token = impossible_token # Dataset. - self.indexed_dataset = indexed_dataset + self.input_indexed_dataset = get_indexed_dataset(data_prefix, is_input=True, data_impl=data_impl, skip_warmup=skip_warmup) + self.target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) # Checks assert np.min(documents) >= 0 - assert np.max(documents) < indexed_dataset['input_tokens'].sizes.shape[0] + assert np.max(documents) < self.input_indexed_dataset.sizes.shape[0] + assert np.max(documents) < self.target_indexed_dataset.sizes.shape[0] + assert self.input_indexed_dataset.sizes.shape[0] == self.target_indexed_dataset.sizes.shape[0] # Build index mappings. self.doc_idx, self.shuffle_idx = _build_index_mappings( - self.name, data_prefix['input_tokens'], documents, self.indexed_dataset['input_tokens'].sizes, - num_samples, seq_length, seed) + name=self.name, + data_prefix=data_prefix, + documents=documents, + num_samples=num_samples, + seed=seed + ) def __len__(self): # -1 is due to data structure used to retieve the index: @@ -265,24 +297,26 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - input_tokens = self.indexed_dataset['input_tokens'].get(self.doc_idx[idx]) - target_tokens = self.indexed_dataset['target_tokens'].get(self.doc_idx[idx]) + input_tokens = self.input_indexed_dataset.get(self.doc_idx[idx]) + target_tokens = self.target_indexed_dataset.get(self.doc_idx[idx]) return { 'input_tokens': np.array(input_tokens, dtype=np.int64), 'target_tokens': np.array(target_tokens, dtype=np.int64), - } + } -def _build_index_mappings(name, data_prefix, documents, sizes, - num_samples, seq_length, seed, cutoff_last_epoch=0.95): +def _build_index_mappings( + name, + data_prefix, + documents, + num_samples, + seed, +): """Build doc-idx, sample-idx, and shuffle-idx. doc-idx: is an array (ordered) of documents to be used in training. shuffle-idx: maps an index into a random index into sample-idx. 
""" - # Number of tokens in each epoch and number of required epochs. - tokens_per_epoch = _num_tokens(documents, sizes) - num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) # rng state np_rng = np.random.RandomState(seed=seed) @@ -290,87 +324,25 @@ def _build_index_mappings(name, data_prefix, documents, sizes, _filename = data_prefix _filename += '_{}_indexmap'.format(name) _filename += '_{}ns'.format(num_samples) - _filename += '_{}sl'.format(seq_length) _filename += '_{}s'.format(seed) doc_idx_filename = _filename + '_doc_idx.npy' - sample_idx_filename = _filename + '_sample_idx.npy' shuffle_idx_filename = _filename + '_shuffle_idx.npy' # Build the indexed mapping if not exist. if torch.distributed.get_rank() == 0: if (not os.path.isfile(doc_idx_filename)) or \ - (not os.path.isfile(sample_idx_filename)) or \ (not os.path.isfile(shuffle_idx_filename)): print_rank_0(' > WARNING: could not find index map files, building ' 'the indices on rank 0 ...') - # For the last epoch, decide whether include the entire epoch - # in the global shuffle or not. - - # If we need only one epoch, then separating last epoch does - # not mean anything. - if num_epochs == 1: - separate_last_epoch = False - print(' > only one epoch required, setting ' - 'separate_last_epoch to False', flush=True) - - else: - # Get the number of samples for the last epoch - num_samples_from_epochs_minus_one = ( - (num_epochs - 1) * tokens_per_epoch - 1) // seq_length - last_epoch_num_samples = num_samples - \ - num_samples_from_epochs_minus_one - assert last_epoch_num_samples >= 0, \ - f'last epoch number of samples {last_epoch_num_samples} should be non-negative.' - num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length - assert last_epoch_num_samples <= num_samples_per_epoch, \ - f'last epoch number of samples {last_epoch_num_samples} exceeded max value {num_samples_per_epoch}.' - # If we have less than cutoff_last_epoch * samples_per_epoch of the samples for the last epoch, - # seperate out the epoch and treat it differently. - separate_last_epoch = (last_epoch_num_samples < - int(cutoff_last_epoch * num_samples_per_epoch)) - if separate_last_epoch: - string = ' > last epoch number of samples ({}) is smaller '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to True' - else: - string = ' > last epoch number of samples ({}) is larger '\ - 'than {}% of number of samples per epoch ({}), '\ - 'setting separate_last_epoch to False' - print(string.format(last_epoch_num_samples, cutoff_last_epoch * 100, - num_samples_per_epoch), flush=True) - # doc-idx. start_time = time.time() - doc_idx = _build_doc_idx(documents, num_epochs, np_rng, - separate_last_epoch) + doc_idx = _build_doc_idx(documents, np_rng) np.save(doc_idx_filename, doc_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save doc-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) - # # sample-idx. - # start_time = time.time() - # # Use C++ implementation for speed. - # # First compile and then import. - # from megatron.data import helpers - # assert doc_idx.dtype == np.int32 - # assert sizes.dtype == np.int32 - # sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, - # num_epochs, tokens_per_epoch) - - # np.save(sample_idx_filename, sample_idx, allow_pickle=True) - # print_rank_0(' > elasped time to build and save sample-idx mapping ' - # '(seconds): {:4f}'.format(time.time() - start_time)) - # # shuffle-idx. 
- # start_time = time.time() - # # -1 is due to data structure used to retieve the index: - # # sample i --> [sample_idx[i], sample_idx[i+1]) - if separate_last_epoch: - num_samples_ = num_samples_from_epochs_minus_one - else: - num_samples_ = doc_idx.shape[0] - 1 - - shuffle_idx = _build_shuffle_idx(num_samples_, doc_idx.shape[0] - 1, np_rng) + shuffle_idx = _build_shuffle_idx(doc_idx.shape[0] - 1 , doc_idx.shape[0] - 1, np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) @@ -390,55 +362,25 @@ def _build_index_mappings(name, data_prefix, documents, sizes, print_rank_0(' > loading doc-idx mapping from {}'.format( doc_idx_filename)) doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r') - print_rank_0(' > loading sample-idx mapping from {}'.format( - sample_idx_filename)) - # sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') - # print_rank_0(' > loading shuffle-idx mapping from {}'.format( - # shuffle_idx_filename)) + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( time.time() - start_time)) - # print_rank_0(' total number of samples: {}'.format( - # sample_idx.shape[0])) - print_rank_0(' total number of epochs: {}'.format(num_epochs)) return doc_idx, shuffle_idx -def _num_tokens(documents, sizes): - """Total number of tokens in the dataset.""" - return np.sum(sizes[documents]) - - -def _num_epochs(tokens_per_epoch, seq_length, num_samples): - """Based on number of samples and sequence lenght, calculate how many - epochs will be needed.""" - num_epochs = 0 - total_tokens = 0 - while True: - num_epochs += 1 - total_tokens += tokens_per_epoch - # -1 is because we need to retrieve seq_length + 1 token each time - # but the last token will overlap with the first token of the next - # sample except for the last sample. - if ((total_tokens - 1) // seq_length) >= num_samples: - return num_epochs - - -def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): +def _build_doc_idx(documents, np_rng): """Build an array with length = number-of-epochs * number-of-dcuments. Each index is mapped to a corresponding document.""" - if not separate_last_epoch or num_epochs == 1: - doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] - doc_idx[:] = documents - doc_idx = doc_idx.reshape(-1) - doc_idx = doc_idx.astype(np.int32) - np_rng.shuffle(doc_idx) - return doc_idx - - doc_idx_first = _build_doc_idx(documents, num_epochs-1, np_rng, False) - doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) - return np.concatenate((doc_idx_first, doc_idx_last)) + num_epochs = 1 + doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx def _build_shuffle_idx(num_samples, total_size, np_rng): @@ -462,16 +404,24 @@ def _build_shuffle_idx(num_samples, total_size, np_rng): return np.concatenate((shuffle_idx_first, shuffle_idx_last)) -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" +def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool): + if is_input: + field = "inputs" + else: + field = "targets" - # Some checks. 
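The deletion of pad_and_convert_to_numpy continues below. With the epoch machinery gone, the two index builders reduce to plain shuffles, and MTFDataset.__getitem__ composes them. A standalone sketch of the retrieval path with toy sizes (the real _build_shuffle_idx also splits very large ranges into two chunks and chooses dtypes by size):

    import numpy as np

    np_rng = np.random.RandomState(seed=1234)

    documents = np.arange(5, dtype=np.int32)
    doc_idx = documents.copy()
    np_rng.shuffle(doc_idx)                      # each document exactly once (num_epochs == 1)

    shuffle_idx = np.arange(doc_idx.shape[0] - 1, dtype=np.int64)
    np_rng.shuffle(shuffle_idx)                  # random order over the usable samples

    idx = 0
    document_id = doc_idx[shuffle_idx[idx]]
    # input_tokens  = input_indexed_dataset.get(document_id)
    # target_tokens = target_indexed_dataset.get(document_id)
    # Both sides are fetched with the same document_id, which is what keeps inputs
    # and targets aligned across the two indexed files (see the TODO in the tests).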
- num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + return get_indexed_dataset_(f"{data_prefix}_{field}_document", data_impl, skip_warmup) - # Tokens and token types. - filler = np.array([pad_id] * padding_length) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) - return tokens_np + return indexed_dataset \ No newline at end of file diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index e07e74766..b063e509b 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -1,9 +1,11 @@ +import itertools from unittest.mock import patch import deepspeed from megatron import global_vars, get_tokenizer, initialize_megatron, get_args -from megatron.data import mlm_dataset +from megatron.data import mlm_dataset, mtf_dataset +from megatron.data.data_samplers import MegatronPackedRandomSampler, build_pretraining_data_loader from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context @@ -25,7 +27,6 @@ def get_default_args(data_dir): "--train-iters": "5000", "--tokenizer-type": "PretrainedFromHF", "--tokenizer-name-or-path": "gpt2", - "--data-path": f"{data_dir}/meg-gpt2-openwebtext_text_document", "--data-impl": "mmap", "--split": "949,50,1", "--distributed-backend": "nccl", @@ -65,6 +66,7 @@ def setUp(self) -> None: def test_mlm_dataset(self): command_args = get_default_args(f"{self.data_dir}/gpt2") + command_args["--data-path"] = f"{self.data_dir}/meg-gpt2-openwebtext_text_document" command_args["--noise_density"] = "0.15" command_args["--mean_noise_span_length"] = "3" command_args["--vocab-extra-ids"] = "100" @@ -106,3 +108,78 @@ def test_mlm_dataset(self): self.assertEqual(sample["input_tokens"][-1], tokenizer.sep) self.assertEqual(sample["target_tokens"][-1], tokenizer.sep) + def test_mtf_dataset(self): + command_args = get_default_args(f"{self.data_dir}/t0") + command_args["--data-path"] = "tests/data/t0/ag_news_prompt" + + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + deepspeed.init_distributed() + initialize_megatron() + + args = get_args() + train_val_test_num_samples = [ + args.train_iters * args.global_batch_size, + args.eval_iters * args.global_batch_size, + 0 + ] + train_ds, valid_ds, test_ds = mtf_dataset.build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + # TODO @thomasw21 figure how that value works + train_valid_test_num_samples=train_val_test_num_samples, + seed=args.seed, + skip_warmup=(not args.mmap_warmup) + ) + + # TODO @thomasw21 make sure that input and target are aligned. 
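The TODO above asks for a check that the `inputs` and `targets` documents stay aligned. As a rough illustration only — assuming, as the packing collator elsewhere in this series does, that each `mtf_dataset` sample is a dict with `input_tokens` and `target_tokens` — such a check could start as small as:

    # Hypothetical follow-up for the TODO; key names mirror the sample dict
    # consumed by pack_samples. Not part of the committed patch.
    sample = train_ds[0]
    self.assertIn("input_tokens", sample)
    self.assertIn("target_tokens", sample)
    self.assertGreater(len(sample["input_tokens"]), 0)
    self.assertGreater(len(sample["target_tokens"]), 0)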
+ + + def test_mtf_packed_dataloader(self): + command_args = get_default_args(f"{self.data_dir}/t0") + command_args["--data-path"] = "tests/data/t0/ag_news_prompt" + + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + deepspeed.init_distributed() + initialize_megatron() + + args = get_args() + train_val_test_num_samples = [ + args.train_iters * args.global_batch_size, + args.eval_iters * args.global_batch_size, + 0 + ] + train_ds, valid_ds, test_ds = mtf_dataset.build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + # TODO @thomasw21 figure how that value works + train_valid_test_num_samples=train_val_test_num_samples, + seed=args.seed, + skip_warmup=(not args.mmap_warmup) + ) + + batch_sampler = build_pretraining_data_loader( + train_ds, consumed_samples=0, num_workers=4 + ) + + for i, items in enumerate(batch_sampler): + micro_batch_size, seq_length = items["decoder_target_tokens"].shape + + # `micro_batch_size` correspond to the one in argument + self.assertEqual(micro_batch_size, args.micro_batch_size) + # `seq_length` correspond to the one in argument + self.assertEqual(seq_length, args.seq_length) + + original_samples_count = 0 + for batch_id in micro_batch_size: + segment_ids = [k for k, _ in itertools.grouby(items["decoder_segment_ids"][batch_id])] + # `segment_ids` is [1,2,...] + self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids)))) + # `0` signify that the tokens are padding + self.assertEqual(segment_ids[-1], 0) + original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0]) + + self.assertGreater(original_samples_count, micro_batch_size) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py index 4dcceda9f..994cedf7b 100644 --- a/tests/test_packing_dataloader.py +++ b/tests/test_packing_dataloader.py @@ -12,7 +12,7 @@ from megatron.initialize import initialize_megatron from megatron.data.data_samplers import MegatronPackedRandomSampler, pack_samples -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets +from megatron.data.mtf_dataset import build_train_valid_test_datasets from megatron.utils import get_packed_attention_mask """ @@ -75,19 +75,6 @@ } ) -train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=[{ - "input_tokens": "tests/data/t0/ag_news_prompt_inputs_document", - "target_tokens": "tests/data/t0/ag_news_prompt_targets_document" - }], - data_impl="mmap", - splits_string="90,5,5", - train_valid_test_num_samples=[50,0,0], - seq_length=seq_length, - seed=124, - skip_warmup=True -) - print("Test show dataset") for idx in range(0,4): line = train_ds[idx] @@ -95,40 +82,5 @@ print(line) -batch_sampler = MegatronPackedRandomSampler( - sequence_length=seq_length, - dataset=train_ds, - total_samples=len(train_ds), - consumed_samples=0, - micro_batch_size=4, - data_parallel_rank=0, - data_parallel_size=1 -) - -dl = torch.utils.data.DataLoader( - train_ds, - batch_sampler=batch_sampler, - num_workers=4, - pin_memory=True, - collate_fn=partial(pack_samples, max_seq_len=seq_length), -) - -for i, items in enumerate(dl): - - micro_batch_size, seq_length = items['decoder_target_tokens'].shape - causal_mask = torch.tril( - torch.ones( - (micro_batch_size, seq_length, seq_length)) - ).view( - micro_batch_size, 1, seq_length, seq_length - ) - - mask = get_packed_attention_mask( - causal_mask=causal_mask, - 
tokens=torch.tensor(items['decoder_target_tokens']),
-        decoder_causal_attention=torch.tensor(items['decoder_causal_attention']),
-        segment_ids=torch.tensor(items['decoder_segment_ids']),
-    )
-
     assert mask.shape == (micro_batch_size, 1, seq_length, seq_length)

diff --git a/tools/preprocess_data_many_cores.py b/tools/preprocess_data_many_cores.py
index aa674e056..11c666d04 100644
--- a/tools/preprocess_data_many_cores.py
+++ b/tools/preprocess_data_many_cores.py
@@ -23,6 +23,9 @@
 - cpus >= 20 (logical cores)
 - large inputs: size >= 1GB
 
+Caveat:
+ - It does not preserve the original ordering, so it cannot be used to build the paired `targets` and `inputs` files, which must stay aligned.
+
 For example using a 40 physical cores (80 logical cores) setup, we can run 60 workers on oscar (1.2T)
 to increase the speed of preprocessing.
 """

From 0d2fdfd63d188989792295a126a67ecef502aa64 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 30 Jun 2022 13:17:23 +0200
Subject: [PATCH 170/297] Cleanup

---
 megatron/data/data_samplers.py   |  2 +-
 megatron/data/mtf_dataset.py     |  2 +-
 megatron/tokenizer/tokenizer.py  | 35 -------------
 tests/test_packing_dataloader.py | 86 --------------------------------
 4 files changed, 2 insertions(+), 123 deletions(-)
 delete mode 100644 tests/test_packing_dataloader.py

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index c3e305b2f..2dad2fd30 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -131,7 +131,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
         num_workers = args.num_workers
 
     collate_fn = None
-    if args.dataloader_type == 'packed':
+    if args.dataloader_type == 'decoder_packed':
         collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1)
 
     # Torch dataloader.
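For orientation between these patches: the `decoder_packed` dataloader type hands each sampled list of `MTFDataset` items to `pack_samples` through `functools.partial`, which greedily packs several input/target pairs into one fixed-length row. A standalone sketch of that call — toy token ids invented here, with the expected outputs taken from the `pack_samples` docstring — would be:

    from functools import partial

    # Two toy samples (token ids are illustrative only).
    items = [
        {"input_tokens": [3, 4], "target_tokens": [5]},
        {"input_tokens": [5], "target_tokens": [6]},
    ]
    collate_fn = partial(pack_samples, max_seq_len=8)
    batch = collate_fn(items)
    # Per the docstring, both samples land in one padded row:
    #   batch["decoder_target_tokens"]    -> [[3, 4, 5, 5, 6, 0, 0, 0]]
    #   batch["decoder_segment_ids"]      -> [[1, 1, 1, 2, 2, 0, 0, 0]]
    #   batch["decoder_causal_attention"] -> [[1, 1, 0, 1, 1, 0, 0, 0]]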
diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py index 8a4839bf2..044a4ab3a 100644 --- a/megatron/data/mtf_dataset.py +++ b/megatron/data/mtf_dataset.py @@ -424,4 +424,4 @@ def get_indexed_dataset_(path, data_impl, skip_warmup): print_rank_0(' number of documents: {}'.format( indexed_dataset.sizes.shape[0])) - return indexed_dataset \ No newline at end of file + return indexed_dataset diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 91e0ad1d6..5281d106e 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -387,38 +387,3 @@ def bos_token_id(self): def eos_token_id(self): """ Id of the end of sentence token in the vocabulary.""" return self.tokenizer.eos_token_id - - @property - def eod(self): - return self.tokenizer.eos_token_id - - @property - def cls(self): - return self.tokenizer.cls_token_id - - @property - def sep(self): - return self.tokenizer.sep_token_id - - @property - def pad(self): - return self.tokenizer.pad_token_id - - @property - def mask(self): - return self.tokenizer.mask_token_id - - @property - def additional_special_tokens_ids(self): - """ All the additional special tokens you may want to use (list of strings).""" - return self.tokenizer.additional_special_tokens_ids - - @property - def bos_token_id(self): - raise NotImplementedError("Missing ") - - @property - def eos_token_id(self): - raise NotImplementedError("Missing ") - - diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py deleted file mode 100644 index 994cedf7b..000000000 --- a/tests/test_packing_dataloader.py +++ /dev/null @@ -1,86 +0,0 @@ -from functools import partial -import sys -from pathlib import Path -import os - -import torch - -# Insert megatron's root dir into sys.path -root_repo_path = str(Path(__file__).resolve().parents[1]) -if root_repo_path not in sys.path: - sys.path.insert(0, root_repo_path) - -from megatron.initialize import initialize_megatron -from megatron.data.data_samplers import MegatronPackedRandomSampler, pack_samples -from megatron.data.mtf_dataset import build_train_valid_test_datasets -from megatron.utils import get_packed_attention_mask - -""" -To preprocess data before testing - -TOKENIZER_PATH="gpt2" -DATA_PATH="tests/data/t0/ag_news_classify_question_first.json" -OUTPUT="tests/data/t0/ag_news_prompt" - -python tools/preprocess_data.py \ - --input $DATA_PATH \ - --output-prefix $OUTPUT \ - --dataset-impl mmap \ - --json-key inputs \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --append-eod \ - --workers 8 - -python tools/preprocess_data.py \ - --input $DATA_PATH \ - --output-prefix $OUTPUT \ - --dataset-impl mmap \ - --json-key targets \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --append-eod \ - --workers 8 -""" - - -""" -Define Environment variables if necessary -""" -os.environ["RANK"] = "0" -os.environ["WORLD_SIZE"] = "1" -os.environ["MASTER_ADDR"] = "jean-zay-pp2" # $(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) -os.environ["MASTER_PORT"] = "6002" -os.environ["LOCAL_RANK"] = "0" - - -seq_length = 256 - - - -# Initialize Megatron with dummy variables -initialize_megatron( - extra_args_provider=None, - allow_no_cuda=True, - args_defaults={ - "micro_batch_size": 4, - "num_layers": 4, - "hidden_size": 64, - "num_attention_heads": 4, - "seq_length": seq_length, - "max_position_embeddings": seq_length, - "distributed_backend": "nccl", - "tokenizer_type": 
"PretrainedFromHF", - "tokenizer_name_or_path": "gpt2", - } -) - -print("Test show dataset") -for idx in range(0,4): - line = train_ds[idx] - print(len(line)) - print(line) - - - - From 126fa34c24fe4f98cca4bbbfd1d3ba0b44d6e4be Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 13:22:44 +0200 Subject: [PATCH 171/297] Add back all configs --- .../gpt2/ag_news_prompt_inputs_document.bin | Bin 0 -> 12038 bytes .../gpt2/ag_news_prompt_inputs_document.idx | Bin 0 -> 2042 bytes .../ag_news_prompt_text_targets_document.bin | Bin 0 -> 12526 bytes .../ag_news_prompt_text_targets_document.idx | Bin 0 -> 2042 bytes tests/test_dataloaders.py | 6 ++++-- 5 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 tests/data/gpt2/ag_news_prompt_inputs_document.bin create mode 100644 tests/data/gpt2/ag_news_prompt_inputs_document.idx create mode 100644 tests/data/gpt2/ag_news_prompt_text_targets_document.bin create mode 100644 tests/data/gpt2/ag_news_prompt_text_targets_document.idx diff --git a/tests/data/gpt2/ag_news_prompt_inputs_document.bin b/tests/data/gpt2/ag_news_prompt_inputs_document.bin new file mode 100644 index 0000000000000000000000000000000000000000..b786d6e414d6c5784520a20da2f254eb9c3e7add GIT binary patch literal 12038 zcmd6Nd3=m#*Z*~{dzs~)d-j=RlF3Xm*^x~WqLWMrNsT0yh}aSlNr;pXJ3&xEP*klg znxd9c<)N*mDAigXEhV}T+P<}vwogl|8|6J$^7hsD_e_4D&->r|htJ5&eP8EX=Q_*x ze9t8-{rvCoK_NR~ySPH_ARmy~{gqTE8ELB~g`1;#ir`k4Ls5c~d>3q{10yx0zkL)) zWU1+loCh!Oc9V3S19o=OkbPV(8Ocec4gw9|>+a(ky-&#hw)(+{_7DzN#_`2R;KYE- z+n%ibvpd*ySGQzwIK-PI;wGz%jR4>d@j|6ImeY_qoz38a2T^YFl(yH~SItfkVUJcP zNu&{8PS6|OWEK}OE*)Our&9<1Hc8=IgQ}e_Nb9~lQb~W)h-BHsCmPHY4v7g*?KkYR zEbi_mTpJM%YhAcfL$(*xadj|FvV)u4nq5srIHaGX2oMBeA}_+C04C2(K2M)dFF-Fi zu-mufn^SddmvI-`y(etYZhJYPV_uE_5i$J3}v8+~oIYwM`;6_)$71nrJ#?>U>MB+PUA5cEk=>K|`KZZ5Abx zT3k<0JZA^3a9CRo(cP4!(PgH|@E-II{+jzy+Ha6Z!JuA32ZIaV=Ozl<#!F<40-&B= zmQFmE9Wt1IPEo~)`2Jqw*g)S>f9HSGtT5MUBr=)%u;&8*RWL_I2e9hc)HGuyBStjKWVgFN-Ni0$N^t{a#=DLsao$&35TYlxRR#K7;vYD1Pu(SJ6lLwpq z%f(3weR7C-iKYSmlB_2(v5Id>%-g2Rf?)seU;Ib~O8Z@RxV=O~c&^VGtKMv)iCW+G zu)kWXs^>n=tfu3l%VDZCU0AB!ZFqt3mOrAvEOJJVrf&9oV7%^+!eYs}l3%<2T-7``yhhd7sgcix~ z4q%^VdSAIO8Q{e*8LcT3CpA3D? zu`~BESR!Qf5{cA6kxn92Fud&m{u|RolvlYg2EG@zKm|Roh^FKD(WtMltn%+UtBEX3 z_+xa7$N9=zO$7A+o4ob^W{v;3{@*i46LDdF`1j1wMD9C%XO1Q^Soi>` zd#~;dPzf^eBR7R6@>Ay1w16*%E@a6>C2>H)QxdtZ%R@PzJ598SWa3$jwq?P7!x`#T zKJfasygOAxsx`GzxN}leo#usM8WKm|q5sNkBFErbl zhmPa99W%c!%@R9B?4Ix$+6drb)E4*@ZPp&+!jK-<)wdp@WM`r}D;#TEI$TJc9sbPR z|LGsKu+2qacH7h4nVcqaoE8W(4FH?Pzt(QjIAbw4sjYGo8EFjbnvULoSeplDgIuth z&QsfA2EMmc(-rq>RfsTAcRI+oE(zol?TdOF^ssh>4A&iQAWX-9x9IPaB1_w8KrFWy=qqDG^HPUaH})LlB># z_6Vc#L?fIJVUeR4EUoy^REgRChVWAlX8&5OYV7Yu@OZa0`^RQ+k<1Z-x!#X#2xGNr zojfthKWl+XBHb0tgQm+HExz}5e$X*$8%B$5m5oc^=BouP?P!adZr!EXVc z9r+e!%s!Sa5Tn zSA;YhU?p%ul%ESO`rk14UO`;*yDJ^O^RMl6M0H64aWiyvP`BO=k5QtTG~h~~speU( z={Z>LSlgKf(PGIl(UYB;9Z?8O*4j%<=7ymVBDdz8@*dw^QS9zR< zbnUFtkS@i?A|-MOec-Ncc7X_MRc75+P=i(A5n8HisPA>#k-m}^UZmgezN@=wtJDot zFk~eseZRX`dzQ_H>rO;h1v(&L-)rcdg7CCKt9`4R3oa>ap2}9RYn`GS%x(WL{=VN9 zaF!0pz2p$_lSNIf(+=|lFZ`)EP6axesuc`%^m1;2e_@6y*xNOTU*m+Ct`~EBYdfX$ z4#Qnthc|xltA$zOA(aS2O*;%~;|(P{vG_L2tsbz$9>r=}Fvbg;w1Zr}*O|Fn5AV%! 
[... remainder of the base85-encoded payloads for ag_news_prompt_inputs_document.bin and ag_news_prompt_inputs_document.idx omitted ...]

diff --git a/tests/data/gpt2/ag_news_prompt_text_targets_document.bin b/tests/data/gpt2/ag_news_prompt_text_targets_document.bin
new file mode 100644
index 0000000000000000000000000000000000000000..60646247e5037a6b277473adb47a2864b90408dd
GIT binary patch
literal 12526

[... base85-encoded payloads for ag_news_prompt_text_targets_document.bin and ag_news_prompt_text_targets_document.idx omitted ...]
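These fixtures are ordinary Megatron mmap indexed datasets: each `_document` prefix pairs a `.bin` payload with a `.idx` index, and they are read back through the `get_indexed_dataset` helpers shown earlier. A quick way to inspect one — sketch only; the `make_indexed_dataset` import path is assumed to match what `mtf_dataset.py` itself uses — is:

    # Illustrative inspection of a committed fixture (import path assumed).
    from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset

    ds = make_indexed_dataset("tests/data/gpt2/ag_news_prompt_inputs_document",
                              "mmap", skip_warmup=True)
    print(ds.sizes.shape[0])  # number of documents in the fixture
    print(ds[0][:10])         # first token ids of document 0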
z$afS6-@;Ov+v72ugSShf{t4Ep33yDyVSNl9voKXg4ky$gJQiTTK5&l@uuhHQpWrhv z>~GlKu)Se-!{&y)4O<&FHf(Fy)c&)lVL!umhTROi88$QQW!THGlVKmjE`~h}TNrjQ z>|ofyuz|7tNsrU=3wZ}@d(z`nVw;U^HMTYBgF4Q3COuL)+m`faZvy`(fN#9n^ zHY9ysx%!+!{Y#-9rl<(maaakfU^T3PwNQOlQ5Q&hu7>sG>aB`~ Date: Thu, 30 Jun 2022 14:54:05 +0200 Subject: [PATCH 172/297] Woops --- megatron/tokenizer/tokenizer.py | 6 ++---- tests/test_dataloaders.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 5281d106e..fcc3ed20d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -380,10 +380,8 @@ def additional_special_tokens_ids(self): @property def bos_token_id(self): - """ Id of the beginning of sentence token in the vocabulary.""" - return self.tokenizer.bos_token_id + raise NotImplementedError("Missing ") @property def eos_token_id(self): - """ Id of the end of sentence token in the vocabulary.""" - return self.tokenizer.eos_token_id + raise NotImplementedError("Missing ") diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 7a9da1129..59ec078cd 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -5,7 +5,7 @@ from megatron import global_vars, get_tokenizer, initialize_megatron, get_args from megatron.data import mlm_dataset, mtf_dataset -from megatron.data.data_samplers import MegatronPackedRandomSampler, build_pretraining_data_loader +from megatron.data.data_samplers import build_pretraining_data_loader from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context From c93ed5ce33afda60a10ad88302d735b336596334 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:01:42 +0200 Subject: [PATCH 173/297] Fix tests --- megatron/arguments.py | 2 +- megatron/data/data_samplers.py | 3 +++ tests/test_dataloaders.py | 14 +++++++------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 230bd4d65..b11a1a1ae 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -553,7 +553,7 @@ def _add_training_args(parser): 'please refer https://github.com/facebookresearch/bitsandbytes.', dest='use_bnb_optimizer') group.add_argument('--dataloader-type', type=str, default=None, - choices=['single', 'cyclic'], + choices=['single', 'cyclic', 'decoder_packed'], help='Single pass vs multiple pass data loader') group.add_argument('--cpu-optimizer', action='store_true', help='Run optimizer on CPU') diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2dad2fd30..0e27e6280 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -22,6 +22,7 @@ from megatron import get_args from megatron import mpu +from megatron.data.mtf_dataset import MTFDataset def pack_samples(items, max_seq_len=2049): @@ -115,6 +116,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'decoder_packed': + assert isinstance(dataset, MTFDataset) batch_sampler = MegatronDecoderPackedText2TextRandomSampler( sequence_length=args.seq_length + 1, dataset=dataset, @@ -132,6 +134,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): collate_fn = None if args.dataloader_type == 'decoder_packed': + assert isinstance(dataset, MTFDataset) collate_fn = 
partial(pack_samples, max_seq_len=args.seq_length + 1) # Torch dataloader. diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 59ec078cd..5f5d54bc6 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -9,7 +9,7 @@ from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context -def get_default_args(data_dir): +def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" return { # GPT_ARGS @@ -65,8 +65,8 @@ def setUp(self) -> None: ) def test_mlm_dataset(self): - command_args = get_default_args(f"{self.data_dir}/gpt2") - command_args["--data-path"] = f"{self.data_dir}/meg-gpt2-openwebtext_text_document" + command_args = get_default_args() + command_args["--data-path"] = f"{self.data_dir}/gpt2/meg-gpt2-openwebtext_text_document" command_args["--noise_density"] = "0.15" command_args["--mean_noise_span_length"] = "3" command_args["--vocab-extra-ids"] = "100" @@ -109,8 +109,8 @@ def test_mlm_dataset(self): self.assertEqual(sample["target_tokens"][-1], tokenizer.sep) def test_mtf_dataset(self): - command_args = get_default_args(f"{self.data_dir}/t0") - command_args["--data-path"] = f"{self.data_dir}/data/gpt2/ag_news_prompt" + command_args = get_default_args() + command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt" command_args["--dataloader-type"] = "decoder_packed" with patch('sys.argv', flatten_arguments(command_args)): @@ -138,8 +138,8 @@ def test_mtf_dataset(self): def test_mtf_packed_dataloader(self): - command_args = get_default_args(f"{self.data_dir}/t0") - command_args["--data-path"] = f"{self.data_dir}/data/gpt2/ag_news_prompt" + command_args = get_default_args() + command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt" command_args["--dataloader-type"] = "decoder_packed" with patch('sys.argv', flatten_arguments(command_args)): From 528f5d348823a7eccd2aa493d143bfd7175a36d9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:03:40 +0200 Subject: [PATCH 174/297] Rename testing files --- ...ment.bin => ag_news_prompt_targets_document.bin} | Bin ...ment.idx => ag_news_prompt_targets_document.idx} | Bin 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/data/gpt2/{ag_news_prompt_text_targets_document.bin => ag_news_prompt_targets_document.bin} (100%) rename tests/data/gpt2/{ag_news_prompt_text_targets_document.idx => ag_news_prompt_targets_document.idx} (100%) diff --git a/tests/data/gpt2/ag_news_prompt_text_targets_document.bin b/tests/data/gpt2/ag_news_prompt_targets_document.bin similarity index 100% rename from tests/data/gpt2/ag_news_prompt_text_targets_document.bin rename to tests/data/gpt2/ag_news_prompt_targets_document.bin diff --git a/tests/data/gpt2/ag_news_prompt_text_targets_document.idx b/tests/data/gpt2/ag_news_prompt_targets_document.idx similarity index 100% rename from tests/data/gpt2/ag_news_prompt_text_targets_document.idx rename to tests/data/gpt2/ag_news_prompt_targets_document.idx From 8bed302d5ccbe0c4042f80e914c3412d8e939207 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:20:11 +0200 Subject: [PATCH 175/297] Do in-place operations --- megatron/data/data_samplers.py | 78 +++++++++++++++++----------------- tests/test_dataloaders.py | 4 +- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 0e27e6280..6e37debdb 100644 
--- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -20,13 +20,15 @@ import numpy as np import torch -from megatron import get_args +from megatron import get_args, get_tokenizer from megatron import mpu from megatron.data.mtf_dataset import MTFDataset -def pack_samples(items, max_seq_len=2049): +def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int): """ + Greedily packs samples. + Items: [ { @@ -45,12 +47,13 @@ def pack_samples(items, max_seq_len=2049): decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]: `0` depicts inputs, `1` depicts target. """ - decoder_target_tokens = [[]] - decoder_segment_ids = [[]] - decoder_causal_attention = [[]] + decoder_target_tokens = np.zeros((micro_batch_size, max_seq_len)) + decoder_segment_ids = np.zeros((micro_batch_size, max_seq_len)) + decoder_causal_attention = np.zeros((micro_batch_size, max_seq_len)) batch_num = 0 - item_num = 0 + # `0` is reserved for padding + item_num = 1 cur_len = 0 for token_dict in items: input_token_len = len(token_dict["input_tokens"]) @@ -60,39 +63,31 @@ def pack_samples(items, max_seq_len=2049): len_diff = max_seq_len - cur_len # Padding if len_diff > 0: - decoder_target_tokens[batch_num].append(np.zeros((len_diff))) - decoder_segment_ids[batch_num].append(np.zeros((len_diff))) - decoder_causal_attention[batch_num].append(np.zeros((len_diff))) + decoder_target_tokens[batch_num][cur_len: max_seq_len] = pad_token + decoder_segment_ids[batch_num][cur_len: max_seq_len] = 0 + decoder_causal_attention[batch_num][cur_len: max_seq_len] = 0 batch_num += 1 - item_num = 0 + assert batch_num < micro_batch_size + item_num = 1 cur_len = 0 - decoder_target_tokens.append([]) - decoder_segment_ids.append([]) - decoder_causal_attention.append([]) - decoder_target_tokens[batch_num].append(token_dict["input_tokens"]) - decoder_target_tokens[batch_num].append(token_dict["target_tokens"]) - cur_len += total_len + decoder_target_tokens[batch_num][cur_len: cur_len + input_token_len] = token_dict["input_tokens"] + decoder_target_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = token_dict["target_tokens"] + decoder_segment_ids[batch_num][cur_len: cur_len + total_len] = item_num + decoder_causal_attention[batch_num][cur_len: cur_len + input_token_len] = 1 # input + decoder_causal_attention[batch_num][cur_len + input_token_len: cur_len + total_len] = 0 # target - decoder_segment_ids[batch_num].append(np.ones((total_len)) + item_num) - decoder_causal_attention[batch_num].append(np.ones((input_token_len))) - decoder_causal_attention[batch_num].append(np.zeros((target_token_len))) item_num += 1 - # Padding - len_diff = max_seq_len - cur_len - if len_diff > 0: - decoder_target_tokens[batch_num].append(np.zeros((len_diff))) - decoder_segment_ids[batch_num].append(np.zeros((len_diff))) - decoder_causal_attention[batch_num].append(np.zeros((len_diff))) + cur_len += total_len + assert cur_len < max_seq_len return { - "decoder_target_tokens": np.stack([np.concatenate(arr) for arr in decoder_target_tokens]), - "decoder_segment_ids": np.stack([np.concatenate(arr) for arr in decoder_segment_ids]), - "decoder_causal_attention": np.stack([np.concatenate(arr) for arr in decoder_causal_attention]), + "decoder_target_tokens": decoder_target_tokens, + "decoder_segment_ids": decoder_segment_ids, + "decoder_causal_attention": decoder_causal_attention, } - def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): """Buld dataloader given an input dataset.""" @@ -127,7 +122,7 @@ 
def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): data_parallel_size=mpu.get_data_parallel_world_size()) else: raise Exception('{} dataloader type is not supported.'.format( - args.dataloader_type)) + args.dataloader_type)) if num_workers is None: num_workers = args.num_workers @@ -135,14 +130,19 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): collate_fn = None if args.dataloader_type == 'decoder_packed': assert isinstance(dataset, MTFDataset) - collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1) + pad_token = get_tokenizer().pad + collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1, micro_batch_size=args.micro_batch_size, + pad_token=pad_token) # Torch dataloader. - return torch.utils.data.DataLoader(dataset, - batch_sampler=batch_sampler, - num_workers=num_workers, - collate_fn=collate_fn, - pin_memory=True) + return torch.utils.data.DataLoader( + dataset, + batch_sampler=batch_sampler, + num_workers=num_workers, + collate_fn=collate_fn, + pin_memory=True + ) + class MegatronPretrainingSampler: @@ -228,7 +228,7 @@ def __iter__(self): # data sharding and random sampling bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size @@ -254,6 +254,7 @@ class MegatronDecoderPackedText2TextRandomSampler(object): To be used with `pack_samples` as collate_fn """ + def __init__(self, sequence_length, dataset, total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size): # Keep a copy of input params for later use. @@ -289,7 +290,7 @@ def __iter__(self): # data sharding and random sampling bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size @@ -317,4 +318,3 @@ def __iter__(self): else: token_lens += tok_len batch.append(idx) - diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 5f5d54bc6..5774ba617 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -172,8 +172,8 @@ def test_mtf_packed_dataloader(self): # `micro_batch_size` correspond to the one in argument self.assertEqual(micro_batch_size, args.micro_batch_size) - # `seq_length` correspond to the one in argument - self.assertEqual(seq_length, args.seq_length) + # `seq_length` correspond to the one in argument + 1 in order to get tokens/labels + self.assertEqual(seq_length, args.seq_length + 1) original_samples_count = 0 for batch_id in micro_batch_size: From bd2fede1391e83be096c511d2662116a2e26bf08 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:20:48 +0200 Subject: [PATCH 176/297] Do in-place operations --- megatron/data/data_samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 6e37debdb..7c3b7a6f4 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -47,7 +47,7 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]: `0` depicts inputs, `1` depicts target. 
""" - decoder_target_tokens = np.zeros((micro_batch_size, max_seq_len)) + decoder_target_tokens = np.full((micro_batch_size, max_seq_len), pad_token) decoder_segment_ids = np.zeros((micro_batch_size, max_seq_len)) decoder_causal_attention = np.zeros((micro_batch_size, max_seq_len)) From 8593e425236cabc7e90eca4fe1850d48ecbd2d8e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:21:50 +0200 Subject: [PATCH 177/297] Woops --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 5774ba617..e2da10264 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -176,7 +176,7 @@ def test_mtf_packed_dataloader(self): self.assertEqual(seq_length, args.seq_length + 1) original_samples_count = 0 - for batch_id in micro_batch_size: + for batch_id in range(micro_batch_size): segment_ids = [k for k, _ in itertools.grouby(items["decoder_segment_ids"][batch_id])] # `segment_ids` is [1,2,...] self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids)))) From a1eb558ae4b68e635dbf67c1f65257fd3f917bdf Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 15:22:30 +0200 Subject: [PATCH 178/297] Fix typo --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index e2da10264..5a226fbc5 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -177,7 +177,7 @@ def test_mtf_packed_dataloader(self): original_samples_count = 0 for batch_id in range(micro_batch_size): - segment_ids = [k for k, _ in itertools.grouby(items["decoder_segment_ids"][batch_id])] + segment_ids = [k for k, _ in itertools.groupby(items["decoder_segment_ids"][batch_id])] # `segment_ids` is [1,2,...] 
self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids))))
                         # `0` signify that the tokens are padding
                         self.assertEqual(segment_ids[-1], 0)

From 3bddafa8460965829a118d362328f6278e6f1b05 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 30 Jun 2022 15:34:09 +0200
Subject: [PATCH 179/297] Add test that packing is done optimally via greedy
 algorithm

---
 tests/test_dataloaders.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 5a226fbc5..366fc5b30 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -167,6 +167,7 @@ def test_mtf_packed_dataloader(self):
                     train_ds, consumed_samples=0, num_workers=4
                 )
 
+                last_padding_size = 0
                 for i, items in enumerate(batch_sampler):
                     micro_batch_size, seq_length = items["decoder_target_tokens"].shape
 
@@ -184,4 +185,14 @@ def test_mtf_packed_dataloader(self):
                         self.assertEqual(segment_ids[-1], 0)
                         original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0])
 
+                    # Test that we actually pack, i.e. we have more samples than the `batch_size`
                     self.assertGreater(original_samples_count, micro_batch_size)
+
+                    # Test that the first sample of each batch couldn't fit inside the previous batch
+                    first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1]
+                    first_sample_size = len(first_sample_segment_ids)
+                    self.assertGreater(first_sample_size, last_padding_size)
+
+                    # update `last_padding_size`
+                    last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0])
+

From 45c94446fdf9f5d66ea4ae063487f2fdfb79280e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 30 Jun 2022 15:36:48 +0200
Subject: [PATCH 180/297] Woops

---
 tests/test_dataloaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 366fc5b30..bd627bae4 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -190,7 +190,7 @@ def test_mtf_packed_dataloader(self):
 
                     # Test that the first sample of each batch couldn't fit inside the previous batch
                     first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1]
-                    first_sample_size = len(first_sample_segment_ids)
+                    first_sample_size = len(list(first_sample_segment_ids))
                     self.assertGreater(first_sample_size, last_padding_size)
 
                     # update `last_padding_size`

From 6f28ae458643303e512e9bdb0903131ae802a2c6 Mon Sep 17 00:00:00 2001
From: lintangsutawika
Date: Mon, 9 May 2022 08:00:24 +0000
Subject: [PATCH 181/297] added capabilities for padding and prefix lm index

---
 pretrain_mp3_gpt.py | 257 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 pretrain_mp3_gpt.py

diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py
new file mode 100644
index 000000000..8dccce361
--- /dev/null
+++ b/pretrain_mp3_gpt.py
@@ -0,0 +1,257 @@
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pretrain GPT""" + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron import mpu +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModel, GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import subprocess + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + prefix_lm=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + + else: + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=True + ) + see_memory_usage(f"After Building Model", force=True) + return model + +_KEYS = ['text', 'prefix_len'] + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + print(data) + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].long() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = _KEYS + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Prefix + prefix_indices = data_b['prefix_len'].long() + + # Get the masks and position ids. 
+ attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=prefix_indices, + loss_on_targets_only=args.loss_on_targets_only + ) + + # weight loss_mask + if args.reweight_loss_based_on_position_frequency: + reweight_loss_mask_(loss_mask, tokens) + + return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator').start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for GPT ...') + # Option 1 of data loading using --data-path + + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + + # Option 2 of data loading using --(train|valid|test)-weighted-split-paths + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating GPT datasets ...") + return train_ds, valid_ds, test_ds + +def command_exists(cmd): + result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + return result.wait() == 0 + +def git_ds_info(): + from deepspeed.env_report import main as ds_report + ds_report() + + # Write out version/git info + git_hash_cmd = "git rev-parse --short HEAD" + git_branch_cmd = "git rev-parse --abbrev-ref HEAD" + if command_exists('git'): + try: + result = subprocess.check_output(git_hash_cmd, shell=True) + git_hash = result.decode('utf-8').strip() + result = subprocess.check_output(git_branch_cmd, 
shell=True) + git_branch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + git_hash = "unknown" + git_branch = "unknown" + else: + git_hash = "unknown" + git_branch = "unknown" + print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') + + +if __name__ == "__main__": + git_ds_info() + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 8a4d99b74b76591350c800ae986185bb748a03f0 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 9 May 2022 22:59:26 +0700 Subject: [PATCH 182/297] added adjustments and new dataset --- megatron/data/non_causal_mlm_dataset.py | 165 ++++++++++++++++++++++++ megatron/tokenizer/tokenizer.py | 19 +++ pretrain_mp3_gpt.py | 5 +- 3 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 megatron/data/non_causal_mlm_dataset.py diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py new file mode 100644 index 000000000..d5f435d37 --- /dev/null +++ b/megatron/data/non_causal_mlm_dataset.py @@ -0,0 +1,165 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""T5 Style dataset.""" + +import collections + +import numpy as np +import torch + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping +) + +class NonCausalMLMDataset(torch.utils.data.Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, masked_lm_prob, + max_seq_length, + short_seq_prob, seed): + + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = masked_lm_prob + self.max_seq_length = max_seq_length + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. 
+        tokenizer = get_tokenizer()
+        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
+        self.vocab_id_to_token_dict = tokenizer.inv_vocab
+        self.cls_id = tokenizer.cls
+        self.sep_id = tokenizer.sep
+        self.mask_id = tokenizer.mask
+        self.pad_id = tokenizer.pad
+        self.bos_id = tokenizer.bos_token_id
+        self.eos_id = tokenizer.eos_token_id
+        self.sentinel_tokens = tokenizer.additional_special_tokens_ids
+        assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script"
+
+    def __len__(self):
+        return self.samples_mapping.shape[0]
+
+    def __getitem__(self, idx):
+
+        start_index, end_index, seq_length = self.samples_mapping[idx]
+        sample = []
+        for index in range(start_index, end_index):
+            sample.append(self.indexed_dataset[index])
+        # Note that this rng state should be numpy and not python since
+        # python randint is inclusive whereas the numpy one is exclusive.
+        np_rng = np.random.RandomState(seed=(self.seed + idx))
+        return build_training_sample(sample,
+                                     self.max_seq_length,  # needed for padding
+                                     self.vocab_id_list,
+                                     self.vocab_id_to_token_dict,
+                                     self.cls_id, self.sep_id,
+                                     self.mask_id, self.pad_id,
+                                     self.masked_lm_prob, np_rng,
+                                     self.bos_id, self.eos_id,
+                                     self.sentinel_tokens)
+
+
+def build_training_sample(sample,
+                          max_seq_length,
+                          vocab_id_list, vocab_id_to_token_dict,
+                          cls_id, sep_id, mask_id, pad_id,
+                          masked_lm_prob, np_rng, bos_id=None,
+                          eos_id=None, sentinel_tokens=None):
+    """Build training sample.
+
+    Arguments:
+        sample: A list of sentences in which each sentence is a list of token ids.
+        max_seq_length: Maximum length of the sequence. All values are padded to
+            this length.
+        vocab_id_list: List of vocabulary ids. Used to pick a random id.
+        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
+        cls_id: Start of example id.
+        sep_id: Separator id.
+        mask_id: Mask token id.
+        pad_id: Padding token id.
+        masked_lm_prob: Probability to mask tokens.
+        np_rng: Random number generator. Note that this rng state should be
+            numpy and not python since python randint is inclusive for
+            the upper bound whereas the numpy one is exclusive.
+        bos_id: start of decoder example id
+        eos_id: end of generation id
+        sentinel_tokens: unique value to be substituted for every replaced span
+    """
+
+    # flatten sentences into one list
+    tokens = [token for sentence in sample for token in sentence]
+
+    # Truncate to `target_sequence_length`.
+    max_num_tokens = max_seq_length
+    truncated = len(tokens) > max_num_tokens
+    tokens = tokens[:max_num_tokens]
+
+    # Masking.
+    max_predictions_per_seq = masked_lm_prob * max_num_tokens
+    (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions(
+        tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
+        cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng
+    )
+
+    # Padding. `pad_and_convert_to_numpy` takes the pad value as its second
+    # argument; masks are binary, so they are padded with 0 rather than pad_id.
+    padded_tokens = pad_and_convert_to_numpy(tokens, pad_id, max_seq_length)
+    padded_labels = pad_and_convert_to_numpy(labels, pad_id, max_seq_length)
+    padded_masks = pad_and_convert_to_numpy(masks, 0, max_seq_length)
+
+    train_sample = {
+        'text': padded_tokens,
+        'labels': padded_labels,
+        'mask': padded_masks,
+        'prefix_len': 0
+    }
+    return train_sample
+
+
+def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length):
+    """Pad sequences and convert them to numpy."""
+
+    # Some checks.
+    num_tokens = len(tokens)
+    padding_length = max_seq_length - num_tokens
+    assert padding_length >= 0
+
+    # Tokens and token types.
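+    # Right-pad with the supplied pad value so every returned array has a
+    # fixed width of max_seq_length.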
+ filler = np.array([pad_id] * padding_length) + tokens_np = np.concatenate((tokens, filler), dtype=np.int64) + + return tokens_np \ No newline at end of file diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index fcc3ed20d..f43338091 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -331,6 +331,9 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} + if 'mask_token' not in self.tokenizer.special_tokens_map: + tokenizer.mask_token = "" + @property def vocab_size(self): return len(self.tokenizer) # vocab_size doesn't contain additional tokens @@ -353,6 +356,22 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def cls(self): + return self.cls_id + + @property + def sep(self): + return self.sep_id + + @property + def pad(self): + return self.pad_id + + @property + def mask(self): + return self.mask_id + @property def eod(self): return self.tokenizer.eos_token_id diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py index 8dccce361..4e61c184e 100644 --- a/pretrain_mp3_gpt.py +++ b/pretrain_mp3_gpt.py @@ -82,7 +82,6 @@ def get_batch(data_iterator): data = next(data_iterator) else: data = None - print(data) data_b = mpu.broadcast_data(keys, data, datatype) # Unpack. @@ -91,7 +90,7 @@ def get_batch(data_iterator): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and postition ids. attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -128,7 +127,7 @@ def get_batch_pipe(data): tokens = tokens_[:, :-1].contiguous() # Prefix - prefix_indices = data_b['prefix_len'].long() + prefix_indices = data_b['prefix_len'].cpu().tolist() # Get the masks and position ids. 
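     # prefix_len is materialized as one plain Python int per sample (rather
     # than a tensor) before being handed to the masking helper below.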
         attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(

From ea445b15c2a2d314465110636ff56c68c6cc5f72 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Sat, 21 May 2022 20:23:32 +0700
Subject: [PATCH 183/297] added sentinel tokens

---
 megatron/tokenizer/tokenizer.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index f43338091..472ad232f 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -334,6 +334,33 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
         if 'mask_token' not in self.tokenizer.special_tokens_map:
             tokenizer.mask_token = ""
 
+        self.tokenizer.add_special_tokens({
+            'additional_special_tokens': [
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+                '',
+            ]
+        })
+
+
     @property
     def vocab_size(self):
         return len(self.tokenizer) # vocab_size doesn't contain additional tokens

From 40708595eb0f1fd7a3189340764bdaf4e77bf588 Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Mon, 23 May 2022 16:00:49 +0700
Subject: [PATCH 184/297] made into input and output tokens

---
 megatron/tokenizer/tokenizer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 472ad232f..48ac61f78 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -360,7 +360,6 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids):
             ]
         })
 
-
     @property
     def vocab_size(self):
         return len(self.tokenizer) # vocab_size doesn't contain additional tokens

From 85e84ecb27d69681f93f465a19491131dfbf174f Mon Sep 17 00:00:00 2001
From: Lintang Sutawika
Date: Thu, 2 Jun 2022 19:31:45 +0700
Subject: [PATCH 185/297] modifying mlm to reflect original implementation

---
 4B8-en-ND-MLM.sh                | 156 +++++++++++++++++++
 megatron/tokenizer/tokenizer.py |  49 +-----
 prepare_tokenizer.py            |  16 ++
 pretrain_mp3_gpt.py             | 256 --------------------------------
 4 files changed, 176 insertions(+), 301 deletions(-)
 create mode 100644 4B8-en-ND-MLM.sh
 create mode 100644 prepare_tokenizer.py
 delete mode 100644 pretrain_mp3_gpt.py

diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh
new file mode 100644
index 000000000..c8e1ba0d6
--- /dev/null
+++ b/4B8-en-ND-MLM.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+
+EXPERIMENT_NAME=4B8-en-ND-MLM
+REPO_PATH=experiments/$EXPERIMENT_NAME
+CHECKPOINT_PATH=$REPO_PATH/checkpoints
+TENSORBOARD_PATH=$REPO_PATH/tensorboard
+CODECARBON_PATH=$REPO_PATH/codecarbon
+LOGS_PATH=$REPO_PATH/logs
+
+DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
+TOKENIZER_PATH=bigscience-tokenizer-padded
+
+# XXX: edit me
+GPUS_PER_NODE=8
+NNODES=1
+PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here
+TP_SIZE=1 # always fixed to the size of a single node
+DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer
+
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=2048
+TRAIN_ITER=39_718
+INPUT_LEN=512
+TARGET_LEN=114
+
+NLAYERS=24
+NHIDDEN=4096
+NHEADS=64
+FFN_HIDDEN_SIZE=10240
+MAX_POSITION_EMBEDDING=1280
+
+SAVE_INTERVAL=1500
+
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.999 \
+    --adam-eps 1e-8 \
+    --lr 2e-4 \
+    --min-lr 1e-5 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+
+EXIT_OPTS=" \
+    --exit-duration-in-mins 1190 \
+    "
+
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
--ffn-hidden-size $FFN_HIDDEN_SIZE \ + --max-position-embeddings $SEQ_LEN \ + --position-embedding-type alibi \ + --encoder-seq-length $INPUT_LEN \ + --decoder-seq-length $TARGET_LEN \ + --micro-batch-size $MICRO_BATCH_SIZE \ + --global-batch-size $GLOBAL_BATCH_SIZE \ + --train-iters $TRAIN_ITER \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --loss-scale 12 \ + --clip-grad 1.0 \ + --fp16 \ + --checkpoint-activations \ + $OPTIMIZER_ARGS \ + $EXIT_OPTS \ + " + +OUTPUT_ARGS=" \ + --log-interval 200 \ + --save-interval $SAVE_INTERVAL \ + --eval-interval $TRAIN_ITER \ + --eval-iters 1 \ + --tensorboard-dir $TENSORBOARD_PATH \ + --tensorboard-queue-size 5 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + " + +ZERO_STAGE=1 + +config_json="./ds_config.json" + +# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() +cat < $config_json +{ + "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, + "train_batch_size": $GLOBAL_BATCH_SIZE, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOT + + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config ${config_json} \ + --zero-stage ${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " + +# export LAUNCHER="python -u -m torch.distributed.launch \ +# --nproc_per_node $GPUS_PER_NODE \ +# " +# # --nnodes $NNODES \ +# # --master_addr $MASTER_ADDR \ +# # --master_port $MASTER_PORT \ + +export CMD=" \ + `pwd`/train_ND_MLM_gpt.py \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + $GPT_ARGS \ + $OUTPUT_ARGS \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + $DEEPSPEED_ARGS \ + " + + +# # clear old checkpoint as it'd mismatch while we sort things out +# rm -rf $SAVE_CHECKPOINT_PATH + + +echo $CMD + +# We create the folder where the logs and codecarbon will be stored. 
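+# (CODECARBON_PATH is defined above but only REPO_PATH and LOGS_PATH are
+# actually created here.)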
+mkdir -p $REPO_PATH +mkdir -p $LOGS_PATH +# to debug - add echo (it exits and prints what it would have launched) + +python -u -m torch.distributed.launch \ + --nproc_per_node $GPUS_PER_NODE \ + $CMD + +# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 48ac61f78..63b58f114 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -331,35 +331,6 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids): self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} - if 'mask_token' not in self.tokenizer.special_tokens_map: - tokenizer.mask_token = "" - - self.tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - '', - ] - }) - @property def vocab_size(self): return len(self.tokenizer) # vocab_size doesn't contain additional tokens @@ -382,6 +353,10 @@ def tokenize(self, text): def detokenize(self, token_ids): return self.tokenizer.decode(token_ids) + @property + def eod(self): + return self.tokenizer.eos_token_id + @property def cls(self): return self.cls_id @@ -398,22 +373,6 @@ def pad(self): def mask(self): return self.mask_id - @property - def eod(self): - return self.tokenizer.eos_token_id - - @property - def cls(self): - return self.tokenizer.cls_token_id - - @property - def sep(self): - return self.tokenizer.sep_token_id - - @property - def pad(self): - return self.tokenizer.pad_token_id - @property def mask(self): return self.tokenizer.mask_token_id diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py new file mode 100644 index 000000000..e058ac62a --- /dev/null +++ b/prepare_tokenizer.py @@ -0,0 +1,16 @@ +from transformers import AutoTokenizer, AddedToken + +tokenizer = AutoTokenizer.from_pretrained('bigscience/tokenizer') + +tokenizer.add_special_tokens({ + 'additional_special_tokens': [ + AddedToken( + ''.format(str(idx).zfill(3)), + lstrip=False, + rstrip=False, + normalization=False + ) for idx in reversed(range(0,200)) + ] + }) + +tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file diff --git a/pretrain_mp3_gpt.py b/pretrain_mp3_gpt.py deleted file mode 100644 index 4e61c184e..000000000 --- a/pretrain_mp3_gpt.py +++ /dev/null @@ -1,256 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Pretrain GPT""" - -import torch -from functools import partial -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer -from megatron import mpu -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group -from megatron.model import GPTModel, GPTModelPipe -from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group - -import deepspeed -from deepspeed.runtime.utils import see_memory_usage -import subprocess - -def model_provider(pre_process=True, post_process=True): - """Build the model.""" - - print_rank_0('building GPT model ...') - see_memory_usage(f"Before Building Model", force=True) - - args = get_args() - - with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, - config_dict_or_path=args.deepspeed_config, - enabled=args.zero_stage == 3, - mpu=mpu): - if args.deepspeed: - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - prefix_lm=True - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe - - else: - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=True - ) - see_memory_usage(f"After Building Model", force=True) - return model - -_KEYS = ['text', 'prefix_len'] - -def get_batch(data_iterator): - """Generate a batch""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - if data_iterator is not None: - data = next(data_iterator) - else: - data = None - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and postition ids. - attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return tokens, labels, loss_mask, attention_mask, position_ids - -def get_batch_pipe(data): - """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" - args = get_args() - tokenizer = get_tokenizer() - - # Items and their type. - keys = _KEYS - datatype = torch.int64 - - # Broadcast data. - data_b = mpu.broadcast_data(keys, data, datatype) - - # Unpack. - tokens_ = data_b['text'].long() - labels = tokens_[:, 1:].contiguous() - tokens = tokens_[:, :-1].contiguous() - - # Prefix - prefix_indices = data_b['prefix_len'].cpu().tolist() - - # Get the masks and position ids. 
- attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( - tokens, - tokenizer.eod, - args.reset_position_ids, - args.reset_attention_mask, - args.eod_mask_loss, - prefix_indices=prefix_indices, - loss_on_targets_only=args.loss_on_targets_only - ) - - # weight loss_mask - if args.reweight_loss_based_on_position_frequency: - reweight_loss_mask_(loss_mask, tokens) - - return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices - -def loss_func(loss_mask, output_tensor): - losses = output_tensor.float() - loss_mask = loss_mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() - - # Reduce loss for logging. - averaged_loss = average_losses_across_data_parallel_group([loss]) - - return loss, {'lm loss': averaged_loss[0]} - - -def forward_step(data_iterator, model): - """Forward step.""" - args = get_args() - timers = get_timers() - - # Get the batch. - timers('batch-generator').start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( - data_iterator) - timers('batch-generator').stop() - - output_tensor = model(tokens, position_ids, attention_mask, - labels=labels) - - return output_tensor, partial(loss_func, loss_mask) - - -def train_valid_test_datasets_provider(train_val_test_num_samples): - """Build train, valid, and test datasets.""" - args = get_args() - train_ds, valid_ds, test_ds = None, None, None - - print_rank_0('> building train, validation, and test datasets for GPT ...') - # Option 1 of data loading using --data-path - - if args.data_path: - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, - seed=args.seed, - skip_warmup=(not args.mmap_warmup)) - - # Option 2 of data loading using --(train|valid|test)-weighted-split-paths - elif args.train_weighted_split_paths: - assigned_train_valid_test = [] - if args.train_weighted_split_paths is not None: - train_ds = [] - assigned_train_valid_test.append("train") - if args.valid_weighted_split_paths is not None: - valid_ds = [] - assigned_train_valid_test.append("valid") - if args.test_weighted_split_paths is not None: - test_ds = [] - assigned_train_valid_test.append("test") - - for s in assigned_train_valid_test: - data_groups = zip(eval(f"args.{s}_weighted_split_paths"), - eval(f"args.{s}_weighted_split_weights"), - eval(f"args.{s}_weighted_split_splits"), - eval(f"args.{s}_weighted_split_names")) - for paths, weights, splits, name in data_groups: - d = build_dataset_group(name, paths, weights, splits, - args.data_impl, - train_val_test_num_samples, - args.seq_length, args.seed, - (not args.mmap_warmup), - train_valid_test=s) - eval(f"{s}_ds").append(d) - else: - raise NotImplementedError("No dataloading argument passed") - - print_rank_0("> finished creating GPT datasets ...") - return train_ds, valid_ds, test_ds - -def command_exists(cmd): - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) - return result.wait() == 0 - -def git_ds_info(): - from deepspeed.env_report import main as ds_report - ds_report() - - # Write out version/git info - git_hash_cmd = "git rev-parse --short HEAD" - git_branch_cmd = "git rev-parse --abbrev-ref HEAD" - if command_exists('git'): - try: - result = subprocess.check_output(git_hash_cmd, shell=True) - git_hash = result.decode('utf-8').strip() - result = subprocess.check_output(git_branch_cmd, 
shell=True) - git_branch = result.decode('utf-8').strip() - except subprocess.CalledProcessError: - git_hash = "unknown" - git_branch = "unknown" - else: - git_hash = "unknown" - git_branch = "unknown" - print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****') - - -if __name__ == "__main__": - git_ds_info() - pretrain(train_valid_test_datasets_provider, model_provider, forward_step, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 39222938bdfb65762d2845b65cbe6a408e8c4c9b Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 2 Jun 2022 23:34:37 +0700 Subject: [PATCH 186/297] minor fix --- megatron/tokenizer/tokenizer.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 63b58f114..fcc3ed20d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -359,19 +359,15 @@ def eod(self): @property def cls(self): - return self.cls_id + return self.tokenizer.cls_token_id @property def sep(self): - return self.sep_id + return self.tokenizer.sep_token_id @property def pad(self): - return self.pad_id - - @property - def mask(self): - return self.mask_id + return self.tokenizer.pad_token_id @property def mask(self): From ee6438f12d561ea1b18ce42ba45362c91fc3e75f Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Thu, 23 Jun 2022 17:05:59 +0700 Subject: [PATCH 187/297] added sampler and test --- megatron/data/data_samplers.py | 67 +++++++++++++++++++++++++++++++- tests/test_packing_dataloader.py | 37 ++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 tests/test_packing_dataloader.py diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 7c3b7a6f4..eb33bfe68 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -40,7 +40,7 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) 'target_tokens': array([5]) } ] - + Output: decoder_target_tokens = [[6, 7, 8, 3, 4, 5, ]]: Concatenation of tokens followed with padding tokens. decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]: Segment ids determine original documents. 
@@ -120,6 +120,14 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
             micro_batch_size=args.micro_batch_size,
             data_parallel_rank=mpu.get_data_parallel_rank(),
             data_parallel_size=mpu.get_data_parallel_world_size())
+    elif args.dataloader_type == 'packed':
+        batch_sampler = MegatronPackedRandomSampler(
+            sequence_length=args.seq_length,
+            total_samples=len(dataset),
+            consumed_samples=consumed_samples,
+            micro_batch_size=args.micro_batch_size,
+            data_parallel_rank=mpu.get_data_parallel_rank(),
+            data_parallel_size=mpu.get_data_parallel_world_size())
     else:
         raise Exception('{} dataloader type is not supported.'.format(
             args.dataloader_type))
@@ -143,7 +151,6 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
         pin_memory=True
     )
 
-
 class MegatronPretrainingSampler:
 
     def __init__(self, total_samples, consumed_samples, micro_batch_size,
@@ -220,6 +227,61 @@ def __init__(self, total_samples, consumed_samples, micro_batch_size,
     def __len__(self):
         return self.total_samples
 
+    def __iter__(self):
+        active_total_samples = self.total_samples - self.last_batch_size
+        self.epoch = self.consumed_samples // active_total_samples
+        current_epoch_samples = self.consumed_samples % active_total_samples
+        assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0
+
+        # data sharding and random sampling
+        bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \
+            * self.micro_batch_size
+        bucket_offset = current_epoch_samples // self.data_parallel_size
+        start_idx = self.data_parallel_rank * bucket_size
+
+        g = torch.Generator()
+        g.manual_seed(self.epoch)
+        random_idx = torch.randperm(bucket_size, generator=g).tolist()
+        idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
+
+        batch = []
+        # Last batch if not complete will be dropped.
+        for idx in idx_range:
+            batch.append(idx)
+            if len(batch) == self.micro_batch_size:
+                self.consumed_samples += self.micro_batch_times_data_parallel_size
+                yield batch
+                batch = []
+
+
+class MegatronPackedRandomSampler(object):
+    """Random sampler for packed samples; mirrors MegatronPretrainingRandomSampler but also records sequence_length."""
+    def __init__(self, sequence_length, total_samples, consumed_samples, micro_batch_size,
+                 data_parallel_rank, data_parallel_size):
+        # Keep a copy of input params for later use.
+        self.sequence_length = sequence_length
+        self.total_samples = total_samples
+        self.consumed_samples = consumed_samples
+        self.micro_batch_size = micro_batch_size
+        self.data_parallel_rank = data_parallel_rank
+        self.data_parallel_size = data_parallel_size
+        self.micro_batch_times_data_parallel_size = \
+            self.micro_batch_size * data_parallel_size
+        self.last_batch_size = \
+            self.total_samples % self.micro_batch_times_data_parallel_size
+
+        # Sanity checks.
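+        # The asserts below keep total_samples, the data-parallel layout and
+        # the consumed-sample counter mutually consistent before __iter__
+        # shards indices across ranks.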
+ assert self.total_samples > 0, \ + 'no sample to consume: {}'.format(self.total_samples) + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 + assert self.data_parallel_rank < data_parallel_size, \ + 'data_parallel_rank should be smaller than data size: {}, ' \ + '{}'.format(self.data_parallel_rank, data_parallel_size) + + def __len__(self): + return self.total_samples + def __iter__(self): active_total_samples = self.total_samples - self.last_batch_size self.epoch = self.consumed_samples // active_total_samples @@ -234,6 +296,7 @@ def __iter__(self): g = torch.Generator() g.manual_seed(self.epoch) + random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py new file mode 100644 index 000000000..d5af66d62 --- /dev/null +++ b/tests/test_packing_dataloader.py @@ -0,0 +1,37 @@ +import os +import torch.distributed as dist + +from megatron.initialize import initialize_megatron +# from megatron.data.data_samplers import MegatronPackedRandomSampler +from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group + +#Initialize Megatron with dummy variables +initialize_megatron( + extra_args_provider=None, + args_defaults={ + "micro_batch_size": 4, + "num_layers": 4, + "hidden_size": 64, + "num_attention_heads": 4, + "seq_length": 256, + "max_position_embeddings": 256, + } + ) + +train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=["tests/data/gpt2/meg-gpt2-openwebtext_text_document"], + data_impl="mmap", + splits_string="90,5,5", + train_valid_test_num_samples=[100,100,100], + seq_length=1024, + seed=124, + skip_warmup=True + ) + +dl = torch.utils.data.DataLoader( + train_ds, + batch_size=4, + # batch_sampler=batch_sampler, + num_workers=4, + pin_memory=True + ) From a869adf56040d4a5c30735b804e1abf01b9ab9bb Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 29 Jun 2022 19:22:21 +0200 Subject: [PATCH 188/297] Enable training --- examples/finetune_t0.sh | 42 +++++++++ finetune_t0.py | 159 +++++++++++++++++++++++++++++++++ megatron/data/data_samplers.py | 1 + megatron/training.py | 2 +- 4 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 examples/finetune_t0.sh create mode 100644 finetune_t0.py diff --git a/examples/finetune_t0.sh b/examples/finetune_t0.sh new file mode 100644 index 000000000..a6a1dc600 --- /dev/null +++ b/examples/finetune_t0.sh @@ -0,0 +1,42 @@ +#! 
/bin/bash + +# Runs the "345M" parameter model + +RANK=0 +WORLD_SIZE=1 + +DATA_PATH=tests/data/t0/ag_news_prompt_inputs_document tests/data/t0/ag_news_prompt_targets_document + +CHECKPOINT_PATH="./checkpoints" +TOKENIZER_PATH=gpt2 + +deepspeed --num_gpus 1 pretrain_t0.py \ + --num-layers 24 \ + --hidden-size 1024 \ + --num-attention-heads 16 \ + --micro-batch-size 4 \ + --global-batch-size 8 \ + --seq-length 1024 \ + --max-position-embeddings 1024 \ + --train-iters 500000 \ + --lr-decay-iters 320000 \ + --save $CHECKPOINT_PATH \ + --load $CHECKPOINT_PATH \ + --data-path $DATA_PATH \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --data-impl mmap \ + --split 949,50,1 \ + --distributed-backend nccl \ + --lr 0.00015 \ + --min-lr 1.0e-5 \ + --lr-decay-style cosine \ + --weight-decay 1e-2 \ + --clip-grad 1.0 \ + --lr-warmup-fraction .01 \ + --checkpoint-activations \ + --log-interval 100 \ + --save-interval 10000 \ + --eval-interval 1000 \ + --eval-iters 10 \ + --fp16 diff --git a/finetune_t0.py b/finetune_t0.py new file mode 100644 index 000000000..7607fa3eb --- /dev/null +++ b/finetune_t0.py @@ -0,0 +1,159 @@ +"""Multitask Finetuning T0""" + +from multiprocessing.sharedctypes import Value +import torch + +from megatron import get_args, get_tokenizer, print_rank_0, mpu +from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.model import GPTModelPipe +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask +from megatron.utils import average_losses_across_data_parallel_group + +import deepspeed +from deepspeed.runtime.utils import see_memory_usage +import os + +try: + from torch.distributed.elastic.multiprocessing.errors import record +except ImportError: + # noop + def record(fn): + return fn + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building GPT model ...') + see_memory_usage(f"Before Building Model", force=True) + + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed: + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe_packed + else: + raise NotImplementedError("DeepSpeed is required for T0") + + see_memory_usage(f"After Building Model", force=True) + return model + +def get_batch_pipe_packed(data): + """ + Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion + + data: + decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] + decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]] + decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]] + """ + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['decoder_target_tokens', 'decoder_segment_ids', 'decoder_causal_attention'] + datatype = torch.int64 + + # Broadcast data. + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. 
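+    # Shift by one for next-token prediction: `tokens[:, t]` is trained to
+    # predict `labels[:, t]`, i.e. token t+1 of the packed row.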
+ tokens_ = data_b['decoder_target_tokens'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + segment_ids = data_b['decoder_segment_ids'].long()[:, :-1] + decoder_causal_attention = data_b['decoder_causal_attention'].long()[:, :-1] + + # Get the masks and position ids. + causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss, + prefix_indices=None, + loss_on_targets_only=False # This is done below + ) + # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding + loss_mask *= torch.logical_and((decoder_causal_attention - 1) * -1, tokens) + loss_mask = loss_mask.to(datatype) + + attention_mask = get_packed_attention_mask( + causal_mask=causal_mask, + tokens=tokens, + decoder_causal_attention=decoder_causal_attention, + segment_ids=segment_ids, + datatype=datatype, + ) + + if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]: + # seqlen-based curriculum learning + # tokens, position_ids, labels, loss_mask have size [batch size, seqlen] + tokens = tokens[:, :args.curriculum_seqlen].contiguous() + position_ids = position_ids[:, :args.curriculum_seqlen].contiguous() + labels = labels[:, :args.curriculum_seqlen].contiguous() + loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous() + + return (tokens, position_ids, attention_mask), (labels, loss_mask) + + +def loss_func(loss_mask, output_tensor): + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + train_ds, valid_ds, test_ds = None, None, None + + print_rank_0('> building train, validation, and test datasets for T0 ...') + # Option 1 of data loading using --data-path + # For T0, data has to be provided in the form --data-path input-data target-data input-data2 target-data2 ... 
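+    # For example (hypothetical prefixes), `--data-path in1 tgt1 in2 tgt2`
+    # becomes [{"input_tokens": "in1", "target_tokens": "tgt1"},
+    #          {"input_tokens": "in2", "target_tokens": "tgt2"}].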
+ if args.data_path: + + # Turn into list of pairs; Overwrite args.data_path to keep len = 1 + # TODO: Not yet compatible with dataset weights (Will break at prefixes, weights = analyze_data_prefix(args.data_path)) + assert len(args.data_path) > 1, "Please provide data in pairs of two: input_tokens target_tokens" + args.data_path = [{"input_tokens": args.data_path[i], "target_tokens": args.data_path[i+1]} for i in range(0, len(args.data_path), 2)] + + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup)) + else: + raise NotImplementedError("No dataloading argument passed") + + print_rank_0("> finished creating T0 datasets ...") + return train_ds, valid_ds, test_ds + +@record +def main(): + pretrain(train_valid_test_datasets_provider, + model_provider, + forward_step_func=None, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + +if __name__ == "__main__": + main() diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index eb33bfe68..5b636cb1b 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -81,6 +81,7 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) cur_len += total_len assert cur_len < max_seq_len + # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here return { "decoder_target_tokens": decoder_target_tokens, "decoder_segment_ids": decoder_segment_ids, diff --git a/megatron/training.py b/megatron/training.py index bbf6623e3..eeceec6ec 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1178,7 +1178,7 @@ def build_train_valid_test_data_iterators( # Build iterators. 
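     # dl_type decides below whether each dataloader is consumed in a single
     # pass or wrapped in cyclic_iter.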
dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'] + assert dl_type in ['single', 'cyclic', 'packed'] if train_dataloader is not None: train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ From 5ae15ef6a03d90d6dbe241880405487e86ed16b5 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 30 Jun 2022 12:06:44 +0200 Subject: [PATCH 189/297] Add T0 training test --- megatron/data/data_samplers.py | 4 ++ megatron/training.py | 8 ++- megatron/utils.py | 59 +++++++++++++++++ tests/test_training.py | 115 +++++++++++++++++++++++++++++++++ 4 files changed, 183 insertions(+), 3 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 5b636cb1b..638aa455a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -16,6 +16,7 @@ """Dataloaders.""" from functools import partial +import logging import numpy as np import torch @@ -24,6 +25,8 @@ from megatron import mpu from megatron.data.mtf_dataset import MTFDataset +logger = logging.get_logger(__name__) + def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int): """ @@ -61,6 +64,7 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) total_len = input_token_len + target_token_len if cur_len + total_len > max_seq_len: len_diff = max_seq_len - cur_len + logger.info(f"Loosing {len_diff} tokens to padding.") # Padding if len_diff > 0: decoder_target_tokens[batch_num][cur_len: max_seq_len] = pad_token diff --git a/megatron/training.py b/megatron/training.py index eeceec6ec..c74cccedf 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -988,6 +988,8 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): if args.deepspeed: # DeepSpeed uses eval_batch() and already aggregates losses. 
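         # eval_batch() internally pulls its micro-batches from the iterator,
         # hence the loss dict is replicated get_num_microbatches() times below.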
assert isinstance(model, list) and len(model) == 1 + print("DITER", data_iterator) + print("DITERNXT", next(iter(data_iterator))) loss = model[0].eval_batch(data_iterator) loss_dicts = [{'lm loss' : loss}] * get_num_microbatches() else: @@ -1181,20 +1183,20 @@ def build_train_valid_test_data_iterators( assert dl_type in ['single', 'cyclic', 'packed'] if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ + train_data_iterator = iter(train_dataloader) if dl_type in ['single', 'packed'] \ else iter(cyclic_iter(train_dataloader)) else: train_data_iterator = None if valid_dataloaders is not None: - valid_data_iterators = [iter(vdl) if dl_type == 'single' \ + valid_data_iterators = [iter(vdl) if dl_type in ['single', 'packed'] \ else iter(cyclic_iter(valid_dataloaders)) for vdl in valid_dataloaders] else: valid_data_iterators = [None] * num_valid_ds if test_dataloaders is not None: - test_data_iterators = [iter(tdl) if dl_type == 'single' \ + test_data_iterators = [iter(tdl) if dl_type in ['single', 'packed'] \ else iter(cyclic_iter(test_dataloaders)) for tdl in test_dataloaders] else: diff --git a/megatron/utils.py b/megatron/utils.py index 98d2f611c..e57079e6f 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -250,6 +250,65 @@ def get_ltor_masks_and_position_ids( return attention_mask, loss_mask, position_ids +def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, segment_ids, datatype=torch.int64): + """ + Inspired by https://github.com/google-research/t5x/blob/7193407f98a8b18100b71a04ff777238be1682ca/t5x/examples/decoder_only/layers.py#L978 + """ + inputs_mask = decoder_causal_attention.unsqueeze(-1) * decoder_causal_attention.unsqueeze(1) + inputs_mask = inputs_mask.unsqueeze(1) + + """Causal Inputs Mask: + mask = [[[[1, 1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 1]]]] + """ + causal_inputs_mask = torch.logical_or(causal_mask, inputs_mask).to(datatype) + + """Padding Mask: + mask = [[[[1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + padding_mask = (tokens > 0).unsqueeze(-1) * (tokens > 0).unsqueeze(1) + padding_mask = padding_mask.unsqueeze(1) + + + """Segment Mask: + mask = [[[[1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + segment_mask = (segment_ids.unsqueeze(-1)) == (segment_ids.unsqueeze(1)) + segment_mask = segment_mask.unsqueeze(1) + + """Final Mask: + mask = [[[[1, 1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0, 0], + [0, 0, 0, 1, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 0], + [0, 0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0]]]] + """ + attention_mask = causal_inputs_mask * padding_mask * segment_mask + + # Convert attention mask to binary: + attention_mask = (attention_mask < 0.5) + + return attention_mask + def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement() diff --git a/tests/test_training.py b/tests/test_training.py index c77cb9af2..505835804 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -468,6 +468,121 @@ def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_ tensorboard_files = 
glob.glob(f"{output_dir}/tensorboard/events*") self.assertEqual(len(tensorboard_files), 2, "tensorboard files") + def test_training_t0(self): + # all in one test + src_dir = self.src_dir + data_dir = f"{self.data_dir}/gpt2" + data_dir_t0 = f"{self.data_dir}/t0" + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) + logs_dir = f"{output_dir}/logs" + Path(logs_dir).mkdir(parents=True, exist_ok=True) + + pp_size, tp_size, dp_size = get_3d_dimensions() + num_gpus = pp_size * tp_size * dp_size + + n_samples = 200 # about 37 iterations + exit_interval = 10 # some samples in the first half and then some more in the 2nd half after resume + + args = f""" + --tensor-model-parallel-size {tp_size} + --pipeline-model-parallel-size {pp_size} + --distributed-backend nccl + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length 128 + --max-position-embeddings 1024 + --micro-batch-size 1 + --rampup-batch-size 2 2 {n_samples} + --global-batch-size 16 + --train-samples {n_samples} + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --clip-grad 1.0 + --weight-decay 1e-1 + --fp16 + + --log-interval 5 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --exit-interval {exit_interval} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --log-path {logs_dir} + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir_t0}/ag_news_prompt_test_inputs_document {data_dir_t0}/ag_news_prompt_test_targets_document + --dataloader-type packed + --split 90,10,0 + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --log-level debug + """.split() + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config.json + --zero-stage 1 + --deepspeed-activation-checkpointing + """.split() + + script = [f"{src_dir}/finetune_t0.py"] + launcher = get_launcher(num_gpus) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + # 1. test training from scratch (no checkpoint) + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test deepspeed is running + self.assertIn("DeepSpeed info", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test there should be no checkpoint this round + self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + # test tensorboard + tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*") + self.assertEqual(len(tensorboard_files), 1, "tensorboard files") + + # 2. 
test training from checkpoint: resume + # now do it again, this time resuming from the checkpoint + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + # test tensorboard (1 file from the first run, plus 1 now) + tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*") + self.assertEqual(len(tensorboard_files), 2, "tensorboard files") @parameterized.expand(["gpt", "prefix", "no_eval"]) def test_mode2_dataloading(self, variation): src_dir = self.src_dir From efa55ea819665a83a63f0dab1ae1c97b86c090e7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 30 Jun 2022 12:08:00 +0200 Subject: [PATCH 190/297] Remove artefacts --- megatron/training.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index c74cccedf..9f006bc30 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -988,8 +988,6 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False): if args.deepspeed: # DeepSpeed uses eval_batch() and already aggregates losses. assert isinstance(model, list) and len(model) == 1 - print("DITER", data_iterator) - print("DITERNXT", next(iter(data_iterator))) loss = model[0].eval_batch(data_iterator) loss_dicts = [{'lm loss' : loss}] * get_num_microbatches() else: From f45266d1b3f376852af267bf3b13c92f9d641bcb Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 30 Jun 2022 12:12:57 +0200 Subject: [PATCH 191/297] Remove artefacts --- tests/test_training.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_training.py b/tests/test_training.py index 505835804..f19feee6f 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -469,11 +469,11 @@ def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_ self.assertEqual(len(tensorboard_files), 2, "tensorboard files") def test_training_t0(self): - # all in one test + src_dir = self.src_dir data_dir = f"{self.data_dir}/gpt2" data_dir_t0 = f"{self.data_dir}/t0" - output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) + output_dir = self.get_auto_remove_tmp_dir() logs_dir = f"{output_dir}/logs" Path(logs_dir).mkdir(parents=True, exist_ok=True) From 8029564f469529e45a0a94e9a8c18e766cad93df Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Thu, 30 Jun 2022 18:16:22 +0200 Subject: [PATCH 192/297] WIP --- 4B8-en-ND-MLM.sh | 156 ------------------ examples/finetune_t0.sh | 42 ----- ...t0.py => finetune_t0_non_causal_decoder.py | 75 +++------ megatron/data/data_samplers.py | 58 ++----- megatron/training.py | 8 +- megatron/utils.py | 30 ++-- prepare_tokenizer.py | 16 -- tests/test_dataloaders.py | 2 +- tests/test_packing_dataloader.py | 37 ----- tests/test_training.py | 17 +- 10 files changed, 71 insertions(+), 370 deletions(-) delete mode 100644 4B8-en-ND-MLM.sh delete mode 100644 examples/finetune_t0.sh rename finetune_t0.py => finetune_t0_non_causal_decoder.py (61%) delete mode 100644 prepare_tokenizer.py delete mode 100644 tests/test_packing_dataloader.py diff --git a/4B8-en-ND-MLM.sh b/4B8-en-ND-MLM.sh deleted file mode 100644 index c8e1ba0d6..000000000 --- a/4B8-en-ND-MLM.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash - -EXPERIMENT_NAME=4B8-en-ND-MLM 
-REPO_PATH=experiments/$EXPERIMENT_NAME -CHECKPOINT_PATH=$REPO_PATH/checkpoints -TENSORBOARD_PATH=$REPO_PATH/tensorboard -CODECARBON_PATH=$REPO_PATH/codecarbon -LOGS_PATH=$REPO_PATH/logs - -DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document -TOKENIZER_PATH=bigscience-tokenizer-padded - -# XXX: edit me -GPUS_PER_NODE=8 -NNODES=1 -PP_SIZE=2 # NLAYERS must be a multiple of PP_SIZE here -TP_SIZE=1 # always fixed to the size of a single node -DP_SIZE=$((NNODES*GPUS_PER_NODE/(PP_SIZE*TP_SIZE))) # will get derived automatically by trainer - -MICRO_BATCH_SIZE=1 -GLOBAL_BATCH_SIZE=2048 -TRAIN_ITER=39_718 -INPUT_LEN=512 -TARGET_LEN=114 - -NLAYERS=24 -NHIDDEN=4096 -NHEADS=64 -FFN_HIDDEN_SIZE=10240 -MAX_POSITION_EMBEDDING=1280 - -SAVE_INTERVAL=1500 - -OPTIMIZER_ARGS=" \ - --optimizer adam \ - --adam-beta1 0.9 \ - --adam-beta2 0.999 \ - --adam-eps 1e-8 \ - --lr 2e-4 \ - --min-lr 1e-5 \ - --lr-decay-style cosine \ - --clip-grad 1.0 \ - --weight-decay 1e-1 \ - " - -EXIT_OPTS=" \ - --exit-duration-in-mins 1190 \ - " - -GPT_ARGS=" \ - --num-layers $NLAYERS \ - --hidden-size $NHIDDEN \ - --num-attention-heads $NHEADS \ - --ffn-hidden-size $FFN_HIDDEN_SIZE \ - --max-position-embeddings $SEQ_LEN \ - --position-embedding-type alibi \ - --encoder-seq-length $INPUT_LEN \ - --decoder-seq-length $TARGET_LEN \ - --micro-batch-size $MICRO_BATCH_SIZE \ - --global-batch-size $GLOBAL_BATCH_SIZE \ - --train-iters $TRAIN_ITER \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --loss-scale 12 \ - --clip-grad 1.0 \ - --fp16 \ - --checkpoint-activations \ - $OPTIMIZER_ARGS \ - $EXIT_OPTS \ - " - -OUTPUT_ARGS=" \ - --log-interval 200 \ - --save-interval $SAVE_INTERVAL \ - --eval-interval $TRAIN_ITER \ - --eval-iters 1 \ - --tensorboard-dir $TENSORBOARD_PATH \ - --tensorboard-queue-size 5 \ - --log-timers-to-tensorboard \ - --log-batch-size-to-tensorboard \ - --log-validation-ppl-to-tensorboard \ - " - -ZERO_STAGE=1 - -config_json="./ds_config.json" - -# Deepspeed figures out GAS dynamically from dynamic GBS via set_train_batch_size() -cat < $config_json -{ - "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE, - "train_batch_size": $GLOBAL_BATCH_SIZE, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": $ZERO_STAGE - }, - "fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 500, - "hysteresis": 2, - "min_loss_scale": 1, - "initial_scale_power": 12 - }, - "steps_per_print": 2000, - "wall_clock_breakdown": false -} -EOT - - -DEEPSPEED_ARGS=" \ - --deepspeed \ - --deepspeed_config ${config_json} \ - --zero-stage ${ZERO_STAGE} \ - --deepspeed-activation-checkpointing \ - " - -# export LAUNCHER="python -u -m torch.distributed.launch \ -# --nproc_per_node $GPUS_PER_NODE \ -# " -# # --nnodes $NNODES \ -# # --master_addr $MASTER_ADDR \ -# # --master_port $MASTER_PORT \ - -export CMD=" \ - `pwd`/train_ND_MLM_gpt.py \ - --tensor-model-parallel-size $TP_SIZE \ - --pipeline-model-parallel-size $PP_SIZE \ - $GPT_ARGS \ - $OUTPUT_ARGS \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - $DEEPSPEED_ARGS \ - " - - -# # clear old checkpoint as it'd mismatch while we sort things out -# rm -rf $SAVE_CHECKPOINT_PATH - - -echo $CMD - -# We create the folder where the logs and codecarbon will be stored. 
-mkdir -p $REPO_PATH -mkdir -p $LOGS_PATH -# to debug - add echo (it exits and prints what it would have launched) - -python -u -m torch.distributed.launch \ - --nproc_per_node $GPUS_PER_NODE \ - $CMD - -# srun '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 | tee -a $LOGS_PATH/main_log.txt \ No newline at end of file diff --git a/examples/finetune_t0.sh b/examples/finetune_t0.sh deleted file mode 100644 index a6a1dc600..000000000 --- a/examples/finetune_t0.sh +++ /dev/null @@ -1,42 +0,0 @@ -#! /bin/bash - -# Runs the "345M" parameter model - -RANK=0 -WORLD_SIZE=1 - -DATA_PATH=tests/data/t0/ag_news_prompt_inputs_document tests/data/t0/ag_news_prompt_targets_document - -CHECKPOINT_PATH="./checkpoints" -TOKENIZER_PATH=gpt2 - -deepspeed --num_gpus 1 pretrain_t0.py \ - --num-layers 24 \ - --hidden-size 1024 \ - --num-attention-heads 16 \ - --micro-batch-size 4 \ - --global-batch-size 8 \ - --seq-length 1024 \ - --max-position-embeddings 1024 \ - --train-iters 500000 \ - --lr-decay-iters 320000 \ - --save $CHECKPOINT_PATH \ - --load $CHECKPOINT_PATH \ - --data-path $DATA_PATH \ - --tokenizer-type PretrainedFromHF \ - --tokenizer-name-or-path $TOKENIZER_PATH \ - --data-impl mmap \ - --split 949,50,1 \ - --distributed-backend nccl \ - --lr 0.00015 \ - --min-lr 1.0e-5 \ - --lr-decay-style cosine \ - --weight-decay 1e-2 \ - --clip-grad 1.0 \ - --lr-warmup-fraction .01 \ - --checkpoint-activations \ - --log-interval 100 \ - --save-interval 10000 \ - --eval-interval 1000 \ - --eval-iters 10 \ - --fp16 diff --git a/finetune_t0.py b/finetune_t0_non_causal_decoder.py similarity index 61% rename from finetune_t0.py rename to finetune_t0_non_causal_decoder.py index 7607fa3eb..66bf43054 100644 --- a/finetune_t0.py +++ b/finetune_t0_non_causal_decoder.py @@ -4,15 +4,14 @@ import torch from megatron import get_args, get_tokenizer, print_rank_0, mpu -from megatron.data.non_causal_mtf_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.data.mtf_dataset import build_train_valid_test_datasets +from megatron.enums import PositionEmbeddingType from megatron.model import GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask -from megatron.utils import average_losses_across_data_parallel_group import deepspeed from deepspeed.runtime.utils import see_memory_usage -import os try: from torch.distributed.elastic.multiprocessing.errors import record @@ -41,39 +40,36 @@ def model_provider(pre_process=True, post_process=True): ) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe_packed + model._megatron_batch_fn = get_batch_pipe else: raise NotImplementedError("DeepSpeed is required for T0") see_memory_usage(f"After Building Model", force=True) return model -def get_batch_pipe_packed(data): +def get_batch_pipe(data): """ Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator` & in packed fashion data: - decoder_target_tokens = [[6, 7, 8, 3, 4, 5, 0]] + decoder_tokens = [[6, 7, 8, 3, 4, 5, 0]] decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]] - decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]] + decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]] """ args = get_args() tokenizer = get_tokenizer() - # Items and their type. - keys = ['decoder_target_tokens', 'decoder_segment_ids', 'decoder_causal_attention'] - datatype = torch.int64 - # Broadcast data. 
-    data_b = mpu.broadcast_data(keys, data, datatype)
+    data_b = mpu.broadcast_data(['decoder_tokens', 'decoder_segment_ids'], data, torch.int64)
+    data_c = mpu.broadcast_data(['decoder_is_inputs'], data, torch.bool)
 
     # Unpack.
-    tokens_ = data_b['decoder_target_tokens'].long()
+    tokens_ = data_b['decoder_tokens'].long()
     labels = tokens_[:, 1:].contiguous()
     tokens = tokens_[:, :-1].contiguous()
 
     segment_ids = data_b['decoder_segment_ids'].long()[:, :-1]
-    decoder_causal_attention = data_b['decoder_causal_attention'].long()[:, :-1]
+    decoder_is_inputs = data_c['decoder_is_inputs'][:, :-1]
 
     # Get the masks and position ids.
     causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
@@ -86,39 +82,24 @@ def get_batch_pipe_packed(data):
         loss_on_targets_only=False # This is done below
     )
     # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding
-    loss_mask *= torch.logical_and((decoder_causal_attention - 1) * -1, tokens)
-    loss_mask = loss_mask.to(datatype)
+    # (`~` rather than `1 - ...`: bool tensors do not support subtraction)
+    loss_on_targets_only = ~data_c['decoder_is_inputs'][:, 1:]
+    loss_on_non_pad_only = (tokens != tokenizer.pad)
+    loss_mask *= loss_on_targets_only * loss_on_non_pad_only
 
     attention_mask = get_packed_attention_mask(
-        causal_mask=causal_mask,
-        tokens=tokens,
-        decoder_causal_attention=decoder_causal_attention,
-        segment_ids=segment_ids,
-        datatype=datatype,
+        # Run non-causal decoder
+        is_causal=False,
+        causal_mask=causal_mask.bool(),
+        decoder_is_inputs=decoder_is_inputs.bool(),
+        segment_ids=segment_ids.long(),
     )
 
-    if args.curriculum_learning and args.curriculum_seqlen < tokens.size()[1]:
-        # seqlen-based curriculum learning
-        # tokens, position_ids, labels, loss_mask have size [batch size, seqlen]
-        tokens = tokens[:, :args.curriculum_seqlen].contiguous()
-        position_ids = position_ids[:, :args.curriculum_seqlen].contiguous()
-        labels = labels[:, :args.curriculum_seqlen].contiguous()
-        loss_mask = loss_mask[:, :args.curriculum_seqlen].contiguous()
+    if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]:
+        raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.")
 
     return (tokens, position_ids, attention_mask), (labels, loss_mask)
 
 
-def loss_func(loss_mask, output_tensor):
-    losses = output_tensor.float()
-    loss_mask = loss_mask.view(-1).float()
-    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()
-
-    # Reduce loss for logging.
-    averaged_loss = average_losses_across_data_parallel_group([loss])
-
-    return loss, {'lm loss': averaged_loss[0]}
-
-
 def train_valid_test_datasets_provider(train_val_test_num_samples):
     """Build train, valid, and test datasets."""
     args = get_args()
@@ -128,18 +109,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
     # Option 1 of data loading using --data-path
     # For T0, data has to be provided in the form --data-path input-data target-data input-data2 target-data2 ...
if args.data_path: - - # Turn into list of pairs; Overwrite args.data_path to keep len = 1 # TODO: Not yet compatible with dataset weights (Will break at prefixes, weights = analyze_data_prefix(args.data_path)) - assert len(args.data_path) > 1, "Please provide data in pairs of two: input_tokens target_tokens" - args.data_path = [{"input_tokens": args.data_path[i], "target_tokens": args.data_path[i+1]} for i in range(0, len(args.data_path), 2)] - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length, seed=args.seed, skip_warmup=(not args.mmap_warmup)) else: @@ -150,10 +125,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): @record def main(): - pretrain(train_valid_test_datasets_provider, - model_provider, - forward_step_func=None, - args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) + pretrain( + train_valid_test_datasets_provider, + model_provider, + forward_step_func=None, + args_defaults={} + ) if __name__ == "__main__": main() diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 638aa455a..d0bbe17cd 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -16,17 +16,12 @@ """Dataloaders.""" from functools import partial -import logging - -import numpy as np import torch from megatron import get_args, get_tokenizer from megatron import mpu from megatron.data.mtf_dataset import MTFDataset -logger = logging.get_logger(__name__) - def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int): """ @@ -45,14 +40,14 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) ] Output: - decoder_target_tokens = [[6, 7, 8, 3, 4, 5, ]]: Concatenation of tokens followed with padding tokens. + decoder_tokens = [[6, 7, 8, 3, 4, 5, ]]: Concatenation of tokens followed with padding tokens. decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]: Segment ids determine original documents. - decoder_causal_attention = [[1, 1, 0, 1, 1, 0, 0]]: `0` depicts inputs, `1` depicts target. + decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. 
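# The docstring's Input/Output pair can be reproduced with a simplified
# standalone packing loop (hypothetical toy items; the real pack_samples also
# spills overflowing documents into the next row of the micro-batch):
import torch

items = [
    {"input_tokens": torch.tensor([6, 7]), "target_tokens": torch.tensor([8])},
    {"input_tokens": torch.tensor([3, 4]), "target_tokens": torch.tensor([5])},
]
max_seq_len, pad = 7, 0

tokens   = torch.full((max_seq_len,), pad, dtype=torch.long)
segments = torch.zeros(max_seq_len, dtype=torch.long)
is_input = torch.zeros(max_seq_len, dtype=torch.bool)
cur = 0
for seg_id, item in enumerate(items, start=1):
    doc = torch.cat([item["input_tokens"], item["target_tokens"]])
    tokens[cur:cur + len(doc)]   = doc
    segments[cur:cur + len(doc)] = seg_id
    is_input[cur:cur + len(item["input_tokens"])] = True
    cur += len(doc)
# tokens   -> [6, 7, 8, 3, 4, 5, 0]
# segments -> [1, 1, 1, 2, 2, 2, 0]
# is_input -> [1, 1, 0, 1, 1, 0, 0]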
""" - decoder_target_tokens = np.full((micro_batch_size, max_seq_len), pad_token) - decoder_segment_ids = np.zeros((micro_batch_size, max_seq_len)) - decoder_causal_attention = np.zeros((micro_batch_size, max_seq_len)) + decoder_tokens = torch.full((micro_batch_size, max_seq_len), pad_token) + decoder_segment_ids = torch.zeros((micro_batch_size, max_seq_len)) + decoder_is_inputs = torch.full((micro_batch_size, max_seq_len), False, dtype=torch.bool) batch_num = 0 # `0` is reserved for padding @@ -64,22 +59,21 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) total_len = input_token_len + target_token_len if cur_len + total_len > max_seq_len: len_diff = max_seq_len - cur_len - logger.info(f"Loosing {len_diff} tokens to padding.") # Padding if len_diff > 0: - decoder_target_tokens[batch_num][cur_len: max_seq_len] = pad_token + decoder_tokens[batch_num][cur_len: max_seq_len] = pad_token decoder_segment_ids[batch_num][cur_len: max_seq_len] = 0 - decoder_causal_attention[batch_num][cur_len: max_seq_len] = 0 + # padded values are already 0, no need to update `decoder_is_inputs` batch_num += 1 assert batch_num < micro_batch_size item_num = 1 cur_len = 0 - decoder_target_tokens[batch_num][cur_len: cur_len + input_token_len] = token_dict["input_tokens"] - decoder_target_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = token_dict["target_tokens"] + decoder_tokens[batch_num][cur_len: cur_len + input_token_len] = token_dict["input_tokens"] + decoder_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = token_dict["target_tokens"] decoder_segment_ids[batch_num][cur_len: cur_len + total_len] = item_num - decoder_causal_attention[batch_num][cur_len: cur_len + input_token_len] = 1 # input - decoder_causal_attention[batch_num][cur_len + input_token_len: cur_len + total_len] = 0 # target + decoder_is_inputs[batch_num][cur_len: cur_len + input_token_len] = 1 # inputs + # targets are already 0 at init, no need to update `decoder_is_inputs` item_num += 1 cur_len += total_len @@ -87,9 +81,9 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here return { - "decoder_target_tokens": decoder_target_tokens, + "decoder_tokens": decoder_tokens, "decoder_segment_ids": decoder_segment_ids, - "decoder_causal_attention": decoder_causal_attention, + "decoder_is_inputs": decoder_is_inputs, } @@ -232,32 +226,6 @@ def __init__(self, total_samples, consumed_samples, micro_batch_size, def __len__(self): return self.total_samples - def __iter__(self): - active_total_samples = self.total_samples - self.last_batch_size - self.epoch = self.consumed_samples // active_total_samples - current_epoch_samples = self.consumed_samples % active_total_samples - assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 - - # data sharding and random sampling - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size - bucket_offset = current_epoch_samples // self.data_parallel_size - start_idx = self.data_parallel_rank * bucket_size - - g = torch.Generator() - g.manual_seed(self.epoch) - random_idx = torch.randperm(bucket_size, generator=g).tolist() - idx_range = [start_idx + x for x in random_idx[bucket_offset:]] - - batch = [] - # Last batch if not complete will be dropped. 
- for idx in idx_range: - batch.append(idx) - if len(batch) == self.micro_batch_size: - self.consumed_samples += self.micro_batch_times_data_parallel_size - yield batch - batch = [] - class MegatronPackedRandomSampler(object): """docstring for MegatronPackedRandomSampler""" diff --git a/megatron/training.py b/megatron/training.py index 9f006bc30..c1033c997 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1178,23 +1178,23 @@ def build_train_valid_test_data_iterators( # Build iterators. dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic', 'packed'] + assert dl_type in ['single', 'cyclic', 'decoder_packed'] if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type in ['single', 'packed'] \ + train_data_iterator = iter(train_dataloader) if dl_type in ['single', 'decoder_packed'] \ else iter(cyclic_iter(train_dataloader)) else: train_data_iterator = None if valid_dataloaders is not None: - valid_data_iterators = [iter(vdl) if dl_type in ['single', 'packed'] \ + valid_data_iterators = [iter(vdl) if dl_type in ['single', 'decoder_packed'] \ else iter(cyclic_iter(valid_dataloaders)) for vdl in valid_dataloaders] else: valid_data_iterators = [None] * num_valid_ds if test_dataloaders is not None: - test_data_iterators = [iter(tdl) if dl_type in ['single', 'packed'] \ + test_data_iterators = [iter(tdl) if dl_type in ['single', 'decoder_packed'] \ else iter(cyclic_iter(test_dataloaders)) for tdl in test_dataloaders] else: diff --git a/megatron/utils.py b/megatron/utils.py index e57079e6f..6b78ee40c 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -250,12 +250,18 @@ def get_ltor_masks_and_position_ids( return attention_mask, loss_mask, position_ids -def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, segment_ids, datatype=torch.int64): +def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decoder_is_inputs: torch.Tensor, segment_ids: torch.Tensor): """ Inspired by https://github.com/google-research/t5x/blob/7193407f98a8b18100b71a04ff777238be1682ca/t5x/examples/decoder_only/layers.py#L978 + + Arguments: + - is_causal: determines if the masking should be causal in the `inputs` part + - causal_mask: torch.BoolTensor [batch_size, sequence_length, sequence_length] + - decoder_is_inputs: torch.BoolTensor [batch_size, sequence_length] + - segment_ids: torch.IntTensor [batch_size, sequence_length] + Returns: + - attention_mask: torch.BoolTensor [batch_size, 1, sequence_length, sequence_length] """ - inputs_mask = decoder_causal_attention.unsqueeze(-1) * decoder_causal_attention.unsqueeze(1) - inputs_mask = inputs_mask.unsqueeze(1) """Causal Inputs Mask: mask = [[[[1, 1, 0, 0, 0, 0, 0], @@ -266,7 +272,14 @@ def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, seg [1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1]]]] """ - causal_inputs_mask = torch.logical_or(causal_mask, inputs_mask).to(datatype) + assert causal_mask.dtype == torch.bool + assert segment_ids.dtype == torch.long + if is_causal: + causal_inputs_mask = causal_mask + else: + assert decoder_is_inputs.dtype == torch.bool + inputs_mask = decoder_is_inputs[:, :, None] * decoder_is_inputs[:, None, :] + causal_inputs_mask = causal_mask + inputs_mask """Padding Mask: mask = [[[[1, 1, 1, 1, 1, 1, 0], @@ -277,9 +290,7 @@ def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, seg [1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0]]]] """ - padding_mask = (tokens > 0).unsqueeze(-1) * (tokens > 
0).unsqueeze(1) - padding_mask = padding_mask.unsqueeze(1) - + padding_mask = (segment_ids != 0)[:, :, None] * (segment_ids != 0)[:, None, :] """Segment Mask: mask = [[[[1, 1, 1, 0, 0, 0, 0], @@ -290,8 +301,7 @@ def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, seg [0, 0, 0, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0]]]] """ - segment_mask = (segment_ids.unsqueeze(-1)) == (segment_ids.unsqueeze(1)) - segment_mask = segment_mask.unsqueeze(1) + segment_mask = segment_ids[:, :, None] == segment_ids[:, None, :] """Final Mask: mask = [[[[1, 1, 0, 0, 0, 0, 0], @@ -307,7 +317,7 @@ def get_packed_attention_mask(causal_mask, tokens, decoder_causal_attention, seg # Convert attention mask to binary: attention_mask = (attention_mask < 0.5) - return attention_mask + return attention_mask[:, None, ...] def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement() diff --git a/prepare_tokenizer.py b/prepare_tokenizer.py deleted file mode 100644 index e058ac62a..000000000 --- a/prepare_tokenizer.py +++ /dev/null @@ -1,16 +0,0 @@ -from transformers import AutoTokenizer, AddedToken - -tokenizer = AutoTokenizer.from_pretrained('bigscience/tokenizer') - -tokenizer.add_special_tokens({ - 'additional_special_tokens': [ - AddedToken( - ''.format(str(idx).zfill(3)), - lstrip=False, - rstrip=False, - normalization=False - ) for idx in reversed(range(0,200)) - ] - }) - -tokenizer.save_pretrained('bigscience-tokenizer-padded') \ No newline at end of file diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index bd627bae4..213c1af4f 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -169,7 +169,7 @@ def test_mtf_packed_dataloader(self): last_padding_size = 0 for i, items in enumerate(batch_sampler): - micro_batch_size, seq_length = items["decoder_target_tokens"].shape + micro_batch_size, seq_length = items["decoder_tokens"].shape # `micro_batch_size` correspond to the one in argument self.assertEqual(micro_batch_size, args.micro_batch_size) diff --git a/tests/test_packing_dataloader.py b/tests/test_packing_dataloader.py deleted file mode 100644 index d5af66d62..000000000 --- a/tests/test_packing_dataloader.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import torch.distributed as dist - -from megatron.initialize import initialize_megatron -# from megatron.data.data_samplers import MegatronPackedRandomSampler -from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group - -#Initialize Megatron with dummy variables -initialize_megatron( - extra_args_provider=None, - args_defaults={ - "micro_batch_size": 4, - "num_layers": 4, - "hidden_size": 64, - "num_attention_heads": 4, - "seq_length": 256, - "max_position_embeddings": 256, - } - ) - -train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=["tests/data/gpt2/meg-gpt2-openwebtext_text_document"], - data_impl="mmap", - splits_string="90,5,5", - train_valid_test_num_samples=[100,100,100], - seq_length=1024, - seed=124, - skip_warmup=True - ) - -dl = torch.utils.data.DataLoader( - train_ds, - batch_size=4, - # batch_sampler=batch_sampler, - num_workers=4, - pin_memory=True - ) diff --git a/tests/test_training.py b/tests/test_training.py index f19feee6f..260a54ba3 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -469,10 +469,7 @@ def test_training_prefix_lm_all(self, loss_on_targets_only, reweight_loss_based_ self.assertEqual(len(tensorboard_files), 2, "tensorboard files") def test_training_t0(self): - 
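# Putting together the causal-inputs, padding and segment masks drawn in the
# get_packed_attention_mask docstrings above, a standalone sketch on the
# running 7-token example reproduces the documented "Final Mask"
# (illustration only, not the repo function):
import torch

segment_ids = torch.tensor([1, 1, 1, 2, 2, 2, 0])
is_inputs   = torch.tensor([1, 1, 0, 1, 1, 0, 0], dtype=torch.bool)

causal_mask  = torch.tril(torch.ones(7, 7, dtype=torch.bool))
inputs_mask  = is_inputs[:, None] & is_inputs[None, :]   # bidirectional over inputs
causal_inputs_mask = causal_mask | inputs_mask
padding_mask = (segment_ids != 0)[:, None] & (segment_ids != 0)[None, :]
segment_mask = segment_ids[:, None] == segment_ids[None, :]

allowed = causal_inputs_mask & padding_mask & segment_mask
# allowed.int() matches the "Final Mask" drawing; the code above then inverts
# it (attention_mask < 0.5), i.e. True marks positions the kernel must mask.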
- src_dir = self.src_dir - data_dir = f"{self.data_dir}/gpt2" - data_dir_t0 = f"{self.data_dir}/t0" + data_path = f"{self.data_dir}/gpt2/ag_news_prompt" output_dir = self.get_auto_remove_tmp_dir() logs_dir = f"{output_dir}/logs" Path(logs_dir).mkdir(parents=True, exist_ok=True) @@ -514,14 +511,13 @@ def test_training_t0(self): --eval-iters 5 --checkpoint-activations --exit-interval {exit_interval} - - --merge-file {data_dir}/gpt2-tiny-merges.txt - --vocab-file {data_dir}/gpt2-tiny-vocab.json + --tokenizer-type PretrainedFromHF + --tokenizer-name-or-path gpt2 --log-path {logs_dir} --save {output_dir}/checkpoints --load {output_dir}/checkpoints - --data-path {data_dir_t0}/ag_news_prompt_test_inputs_document {data_dir_t0}/ag_news_prompt_test_targets_document - --dataloader-type packed + --data-path {data_path} + --dataloader-type decoder_packed --split 90,10,0 --tensorboard-dir {output_dir}/tensorboard --tensorboard-queue-size 5 @@ -539,7 +535,7 @@ def test_training_t0(self): --deepspeed-activation-checkpointing """.split() - script = [f"{src_dir}/finetune_t0.py"] + script = [f"{self.src_dir}/finetune_t0_non_causal_decoder.py"] launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args @@ -583,6 +579,7 @@ def test_training_t0(self): # test tensorboard (1 file from the first run, plus 1 now) tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*") self.assertEqual(len(tensorboard_files), 2, "tensorboard files") + @parameterized.expand(["gpt", "prefix", "no_eval"]) def test_mode2_dataloading(self, variation): src_dir = self.src_dir From 4faa7434976b511fb1ba10c53200581164092976 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:20:10 +0200 Subject: [PATCH 193/297] WIP --- megatron/enums.py | 3 +- megatron/model/fused_softmax.py | 3 +- megatron/model/gpt_model.py | 6 +-- megatron/model/transformer.py | 3 +- pretrain_gpt.py | 33 +++++++-------- pretrain_prefix_lm.py | 5 ++- tests/test_dataloaders.py | 73 +++++++++++++++++++++++++++++++++ tests/test_model.py | 7 ++++ 8 files changed, 109 insertions(+), 24 deletions(-) diff --git a/megatron/enums.py b/megatron/enums.py index d9050462a..90d00a071 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -25,8 +25,9 @@ class AttnType(enum.Enum): class AttnMaskType(enum.Enum): padding = 1 - causal = 2 + causal = 2 # Overrides `attention_mask` to be a lower triangular matrix prefix = 3 + custom = 4 # Forces one to pass an `attention_mask` that's 1 if we need to mask. Tensor that can be broadcast to [micro_batch_size, n_head, seq_length, seq_length] class PositionEmbeddingType(enum.Enum): rotary = 1 diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index a4a788586..a3054d730 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -124,7 +124,6 @@ class FusedScaleMaskSoftmax(nn.Module): softmax_in_fp32: if true, softmax in performed at fp32 precision. scale: scaling factor used in input tensor scaling. 
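# A minimal sketch of what the new AttnMaskType.custom contract implies: a
# boolean mask, True where masking is wanted, broadcastable to
# [micro_batch_size, n_head, seq_length, seq_length] (toy shapes only):
import torch

micro_batch_size, n_head, seq_length = 2, 4, 8
mask = torch.zeros(micro_batch_size, 1, seq_length, seq_length, dtype=torch.bool)
mask[..., -1] = True                                # e.g. never attend to the last key

scores = torch.randn(micro_batch_size, n_head, seq_length, seq_length)
scores = scores.masked_fill(mask, float("-inf"))    # the head dim broadcasts
probs = torch.softmax(scores, dim=-1)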
""" - custom_kernel_friendly_attn_mask_type = [AttnMaskType.causal, AttnMaskType.padding] def __init__( self, @@ -187,7 +186,7 @@ def forward_fused_softmax(self, input, mask): b, np, sq, sk = input.size() scale = self.scale if self.scale is not None else 1.0 - if self.attn_mask_type == AttnMaskType.causal: + if self.attn_mask_type == AttnMaskType.causal and mask is None: assert sq == sk, "causal mask is only for self attention" # input is 3D tensor (attn_batches, sq, sk) diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py index 31d33a91b..dce77d23d 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/model/gpt_model.py @@ -202,7 +202,7 @@ def __init__( self, num_tokentypes=0, parallel_output=True, - prefix_lm=False + attn_mask_type: AttnMaskType = AttnMaskType.causal ): args = get_args() self.parallel_output = parallel_output @@ -252,7 +252,7 @@ def _to_float16(inputs): args.num_layers), layer_number=layer_idx, # TODO: Change naming of class from GPT to something that encapsulate prefix lm. - self_attn_mask_type=AttnMaskType.prefix if prefix_lm else AttnMaskType.causal)) + self_attn_mask_type=attn_mask_type)) if not hasattr(args, 'attn_mask'): @@ -314,7 +314,7 @@ def _logits_helper(embedding, lm_output): partition_method = 'type:transformer' super().__init__(layers=self.specs, - loss_fn=get_cross_entropy(is_prefix=prefix_lm), + loss_fn=get_cross_entropy(is_prefix=attn_mask_type is AttnMaskType.prefix), topology=topo, activation_checkpoint_interval=interval, partition_method=partition_method) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 48401a9f1..9232d84a5 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -333,6 +333,7 @@ def forward(self, hidden_states, attention_mask, layer_past=None, if get_key_value: with torch.no_grad(): + # TODO @thomasw21 Handle case where `attention_mask` is None if layer_past is not None: attention_mask = attention_mask[ ..., @@ -643,7 +644,7 @@ def forward(self, inputs, **kwargs): # No attention mask forwarded, search for args.attn_mask if not hasattr(self, '_args'): self._args = get_args() - hidden_states, attention_mask = inputs, self._args.attn_mask + hidden_states, attention_mask = inputs, None return super().forward(hidden_states, attention_mask, **kwargs) elif len(inputs) == 2: # Attention mask is an activation. diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 04f1b3b57..faa45050c 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -53,22 +53,23 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: - # Precompute the attention mask and store it in args. This avoids having to - # pipeline it as an activation during training. The mask is constant, and thus - # we can reuse it. - attention_mask = torch.tril(torch.ones( - (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( - 1, 1, args.seq_length, args.seq_length) - - # Convert attention mask to binary: - attention_mask = (attention_mask < 0.5) - if args.fp16: - attention_mask = attention_mask.half() - elif args.bf16: - attention_mask = attention_mask.bfloat16() - - # must be bool or the training crashes expecting bool, but getting Half - args.attn_mask = attention_mask.to(torch.bool) + # We don't need it. + # # Precompute the attention mask and store it in args. This avoids having to + # # pipeline it as an activation during training. The mask is constant, and thus + # # we can reuse it. 
+ # attention_mask = torch.tril(torch.ones( + # (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view( + # 1, 1, args.seq_length, args.seq_length) + # + # # Convert attention mask to binary: + # attention_mask = (attention_mask < 0.5) + # if args.fp16: + # attention_mask = attention_mask.half() + # elif args.bf16: + # attention_mask = attention_mask.bfloat16() + # + # # must be bool or the training crashes expecting bool, but getting Half + # args.attn_mask = attention_mask.to(torch.bool) model = GPTModelPipe( num_tokentypes=0, diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py index 391186e75..f0f97d566 100644 --- a/pretrain_prefix_lm.py +++ b/pretrain_prefix_lm.py @@ -23,6 +23,7 @@ from megatron import get_tokenizer from megatron import mpu from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.enums import AttnMaskType from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ @@ -46,10 +47,12 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + assert args.attn_mask_type == AttnMaskType.prefix + model = GPTModelPipe( num_tokentypes=0, parallel_output=True, - prefix_lm=True + attn_mask_type=args.attn_mask_type ) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 213c1af4f..a936b96df 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -1,8 +1,11 @@ import itertools +from typing import List, Set from unittest.mock import patch import deepspeed +import torch +import finetune_t0_non_causal_decoder from megatron import global_vars, get_tokenizer, initialize_megatron, get_args from megatron.data import mlm_dataset, mtf_dataset from megatron.data.data_samplers import build_pretraining_data_loader @@ -48,6 +51,44 @@ def get_default_args(): # DATA_ARGS } +def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]): + seq_length += 1 + + num_segments = torch.randint(1, 5, ()) + segment_ids = torch.zeros(micro_batch_size, seq_length) + is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool) + for batch_id in range(micro_batch_size): + # - `*2`: Hack in order to two start_new_segements to be seperated with two tokens at least + # - `+1`: Hack in order the start_mew_segments not to be 0 + start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1 + segment_ids[batch_id, start_new_segments] = 1 + + end_inputs = [ + torch.randint(low=start_segment, high=end_segment, size=()) + for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length]) + ] + for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]): + is_inputs[batch_id][start_segment: end_input + 1] = True + + segment_ids = torch.cumsum(segment_ids, dim=-1) + 1 + tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length)) + + flatten_token_view = tokens.view(-1,) + for token_id in range(len(flatten_token_view)): + token = flatten_token_view[token_id] + # While token is a special tokens we change that token + while token in special_tokens_ids: + flatten_token_view[token_id] = (token + 1) % 
vocab_size + token = flatten_token_view[token_id] + + return { + "decoder_tokens": tokens, + "decoder_segment_ids": segment_ids, + "decoder_is_inputs": is_inputs + } + +get_dummy_mtf_decoder_packed_data(1, 128, 52000, set([1,2,3,4])) + class TestDataLoading(TestCasePlus): def setUp(self) -> None: super().setUp() @@ -196,3 +237,35 @@ def test_mtf_packed_dataloader(self): # update `last_padding_size` last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) + def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): + command_args = get_default_args() + + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + deepspeed.init_distributed() + initialize_megatron() + + args = get_args() + tokenizer = get_tokenizer() + # Dummy data + data = get_dummy_mtf_decoder_packed_data( + micro_batch_size=args.micro_batch_size, + seq_length=args.seq_length, + vocab_size=args.padded_vocab_size, + special_tokens_ids={tokenizer.pad} + ) + + (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) + + self.assertEqual(loss_mask, data["decoder_is_inputs"][:, 1:]) + self.assertEqual(tokens, data["decoder_tokens"][:, :-1]) + self.assertEqual(labels, data["decoder_tokens"][:, 1:]) + + # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent + segment_cuts = torch.nonzero(data["decoder_segment_ids"][:, 1:] - data["decoder_segment_ids"][:, :-1]) + for batch_id in range(args.micro_batch_size): + for segment_start, segment_end in zip([0, *segment_cuts[batch_id]], [*segment_cuts[batch_id], args.seq_length]): + self.assertEqual(attention_mask[batch_id, segment_start: segment_end, :segment_start], 1) + self.assertEqual(attention_mask[batch_id, segment_start: segment_end, segment_end:], 1) + + # TODO @thomasw21 make sure that we reset `position_ids` diff --git a/tests/test_model.py b/tests/test_model.py index 6defd784d..6fb1074b2 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -307,6 +307,13 @@ def test_fused_layer_norm(self): torch_assert_equal(mfln_output, torch_layer_norm_output) + def test_gpt_model_passed_with_attention_mask_is_not_causal(self): + # TODO @thomasw21 make sure that if pass a causal mask, it is take in account. The following shows that fused_kernel completely ignores the masking is we set the variable incorrectly. + # https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/131bd43e9f3552f2413a442f51c22214d4f6fb19/megatron/model/fused_softmax.py#L190 + # Maybe we should pass None is case as attention_mask instead of silently ignoring mask. + # We should test that is we modify an element in a segment, it only affects that segment. 
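# The segment-id construction in the dummy-data helper above boils down to a
# cumulative sum over "new segment starts here" markers; a tiny standalone check:
import torch

markers = torch.zeros(10, dtype=torch.long)
markers[torch.tensor([3, 7])] = 1          # segments begin at positions 3 and 7
segment_ids = torch.cumsum(markers, dim=-1) + 1
# segment_ids -> tensor([1, 1, 1, 2, 2, 2, 2, 3, 3, 3])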
+ raise NotImplementedError() + if __name__ == '__main__': unittest.main() From 3a6d73d1e746a93c11bd5382e875a3078f8e3af9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:29:08 +0200 Subject: [PATCH 194/297] WIP --- megatron/data/data_samplers.py | 4 ++-- tests/test_dataloaders.py | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index d0bbe17cd..9eee859da 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -45,8 +45,8 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. """ - decoder_tokens = torch.full((micro_batch_size, max_seq_len), pad_token) - decoder_segment_ids = torch.zeros((micro_batch_size, max_seq_len)) + decoder_tokens = torch.full((micro_batch_size, max_seq_len), pad_token, dtype=torch.int64) + decoder_segment_ids = torch.zeros((micro_batch_size, max_seq_len), dtype=torch.int64) decoder_is_inputs = torch.full((micro_batch_size, max_seq_len), False, dtype=torch.bool) batch_num = 0 diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index a936b96df..a30119fbc 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -55,7 +55,7 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo seq_length += 1 num_segments = torch.randint(1, 5, ()) - segment_ids = torch.zeros(micro_batch_size, seq_length) + segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.int64) is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool) for batch_id in range(micro_batch_size): # - `*2`: Hack in order to two start_new_segements to be seperated with two tokens at least @@ -71,8 +71,8 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo is_inputs[batch_id][start_segment: end_input + 1] = True segment_ids = torch.cumsum(segment_ids, dim=-1) + 1 - tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length)) + tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long) flatten_token_view = tokens.view(-1,) for token_id in range(len(flatten_token_view)): token = flatten_token_view[token_id] @@ -212,6 +212,11 @@ def test_mtf_packed_dataloader(self): for i, items in enumerate(batch_sampler): micro_batch_size, seq_length = items["decoder_tokens"].shape + # Check dtypes + self.assertEqual(items["decoder_tokens"].dtype, torch.int64) + self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64) + self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool) + # `micro_batch_size` correspond to the one in argument self.assertEqual(micro_batch_size, args.micro_batch_size) # `seq_length` correspond to the one in argument + 1 in order to get tokens/labels From ea86bc8f99362c9d3cd27c6d4a3a21d82fec3021 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:31:49 +0200 Subject: [PATCH 195/297] WIP --- finetune_t0_non_causal_decoder.py | 2 +- tests/test_dataloaders.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 66bf43054..4b8717f8a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -82,7 +82,7 @@ def get_batch_pipe(data): loss_on_targets_only=False # This is done 
below ) # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding - loss_on_targets_only = 1 - data_c['decoder_is_inputs'][:, 1:] + loss_on_targets_only = ~data_c['decoder_is_inputs'][:, 1:] loss_on_non_pad_only = (tokens != tokenizer.pad) loss_mask *= loss_on_targets_only * loss_on_non_pad_only diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index a30119fbc..bba16e3b9 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -87,8 +87,6 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo "decoder_is_inputs": is_inputs } -get_dummy_mtf_decoder_packed_data(1, 128, 52000, set([1,2,3,4])) - class TestDataLoading(TestCasePlus): def setUp(self) -> None: super().setUp() From 638fc56700ee669dc8940bb48388b106d7b8c105 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:35:06 +0200 Subject: [PATCH 196/297] WIP --- tests/test_dataloaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index bba16e3b9..edc60c247 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -242,6 +242,7 @@ def test_mtf_packed_dataloader(self): def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): command_args = get_default_args() + command_args["--position-embedding-type"] = "alibi" with patch('sys.argv', flatten_arguments(command_args)): with mockenv_context(**self.dist_env_1_gpu): From 66d2afe861ad2feda2317b8966139d3d3a0b4a53 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:40:57 +0200 Subject: [PATCH 197/297] move to cpu for comparison --- tests/test_dataloaders.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index edc60c247..387e817ab 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -261,6 +261,12 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) + tokens = tokens.cpu() + position_ids = position_ids.cpu() + attention_mask = attention_mask.cpu() + labels = labels.cpu() + loss_mask = loss_mask.cpu() + self.assertEqual(loss_mask, data["decoder_is_inputs"][:, 1:]) self.assertEqual(tokens, data["decoder_tokens"][:, :-1]) self.assertEqual(labels, data["decoder_tokens"][:, 1:]) From 3794b86a9c6647b4ff0d689daf998f1ac28fddff Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:44:52 +0200 Subject: [PATCH 198/297] Use torch_assert_equal --- tests/test_dataloaders.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 387e817ab..46d674bdf 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -9,7 +9,7 @@ from megatron import global_vars, get_tokenizer, initialize_megatron, get_args from megatron.data import mlm_dataset, mtf_dataset from megatron.data.data_samplers import build_pretraining_data_loader -from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context +from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context, torch_assert_equal def get_default_args(): @@ -267,15 +267,15 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): labels = labels.cpu() loss_mask = loss_mask.cpu() - 
self.assertEqual(loss_mask, data["decoder_is_inputs"][:, 1:]) - self.assertEqual(tokens, data["decoder_tokens"][:, :-1]) - self.assertEqual(labels, data["decoder_tokens"][:, 1:]) + torch_assert_equal(loss_mask, data["decoder_is_inputs"][:, 1:]) + torch_assert_equal(tokens, data["decoder_tokens"][:, :-1]) + torch_assert_equal(labels, data["decoder_tokens"][:, 1:]) # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent segment_cuts = torch.nonzero(data["decoder_segment_ids"][:, 1:] - data["decoder_segment_ids"][:, :-1]) for batch_id in range(args.micro_batch_size): for segment_start, segment_end in zip([0, *segment_cuts[batch_id]], [*segment_cuts[batch_id], args.seq_length]): - self.assertEqual(attention_mask[batch_id, segment_start: segment_end, :segment_start], 1) - self.assertEqual(attention_mask[batch_id, segment_start: segment_end, segment_end:], 1) + torch_assert_equal(attention_mask[batch_id, segment_start: segment_end, :segment_start], 1) + torch_assert_equal(attention_mask[batch_id, segment_start: segment_end, segment_end:], 1) # TODO @thomasw21 make sure that we reset `position_ids` From 346b08f937b6b73d1632aed2376cd6b46d7993ce Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:46:53 +0200 Subject: [PATCH 199/297] WIP --- tests/test_dataloaders.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 46d674bdf..0bf0dcfba 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -55,7 +55,7 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo seq_length += 1 num_segments = torch.randint(1, 5, ()) - segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.int64) + segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long) is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool) for batch_id in range(micro_batch_size): # - `*2`: Hack in order to two start_new_segements to be seperated with two tokens at least @@ -267,7 +267,8 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): labels = labels.cpu() loss_mask = loss_mask.cpu() - torch_assert_equal(loss_mask, data["decoder_is_inputs"][:, 1:]) + self.assertEqual(loss_mask.dtype, torch.float) + torch_assert_equal(loss_mask.bool(), data["decoder_is_inputs"][:, 1:]) torch_assert_equal(tokens, data["decoder_tokens"][:, :-1]) torch_assert_equal(labels, data["decoder_tokens"][:, 1:]) From 4203f6cbbf9ea862c1837f1ca22543dc77ccc966 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:49:56 +0200 Subject: [PATCH 200/297] Take in account pad + fix inverse --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 0bf0dcfba..d9c3b32fa 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -268,7 +268,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): loss_mask = loss_mask.cpu() self.assertEqual(loss_mask.dtype, torch.float) - torch_assert_equal(loss_mask.bool(), data["decoder_is_inputs"][:, 1:]) + torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_tokens"] != tokenizer.pad)) torch_assert_equal(tokens, data["decoder_tokens"][:, :-1]) torch_assert_equal(labels, data["decoder_tokens"][:, 1:]) From bcba2b719f209e79ed8c3ec146a8b92d4f060f0d Mon Sep 17 
00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:52:33 +0200 Subject: [PATCH 201/297] Tensor and int can't be compared vi torch_assert_equal --- tests/test_dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index d9c3b32fa..6442d9830 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -276,7 +276,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): segment_cuts = torch.nonzero(data["decoder_segment_ids"][:, 1:] - data["decoder_segment_ids"][:, :-1]) for batch_id in range(args.micro_batch_size): for segment_start, segment_end in zip([0, *segment_cuts[batch_id]], [*segment_cuts[batch_id], args.seq_length]): - torch_assert_equal(attention_mask[batch_id, segment_start: segment_end, :segment_start], 1) - torch_assert_equal(attention_mask[batch_id, segment_start: segment_end, segment_end:], 1) + self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, :segment_start] == 1)) + torch_assert_equal(torch.all(attention_mask[batch_id, segment_start: segment_end, segment_end:] == 1)) # TODO @thomasw21 make sure that we reset `position_ids` From 57156e1daee630357f0dd77c9f5e51947b231cb2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:53:04 +0200 Subject: [PATCH 202/297] Woops --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 6442d9830..8b70b8c8d 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -277,6 +277,6 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): for batch_id in range(args.micro_batch_size): for segment_start, segment_end in zip([0, *segment_cuts[batch_id]], [*segment_cuts[batch_id], args.seq_length]): self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, :segment_start] == 1)) - torch_assert_equal(torch.all(attention_mask[batch_id, segment_start: segment_end, segment_end:] == 1)) + self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, segment_end:] == 1)) # TODO @thomasw21 make sure that we reset `position_ids` From 45d92189a78c413368fcb70ecb15682da2440daa Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:58:11 +0200 Subject: [PATCH 203/297] Test --- tests/test_dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 8b70b8c8d..80cc036b2 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -273,9 +273,9 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): torch_assert_equal(labels, data["decoder_tokens"][:, 1:]) # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent - segment_cuts = torch.nonzero(data["decoder_segment_ids"][:, 1:] - data["decoder_segment_ids"][:, :-1]) for batch_id in range(args.micro_batch_size): - for segment_start, segment_end in zip([0, *segment_cuts[batch_id]], [*segment_cuts[batch_id], args.seq_length]): + segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]): self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, 
:segment_start] == 1)) self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, segment_end:] == 1)) From 959fc71dbf9cd6e34158bcdd5094b68bbce0a877 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:00:26 +0200 Subject: [PATCH 204/297] Woops --- tests/test_dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 80cc036b2..f8a0ffae9 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -276,7 +276,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): for batch_id in range(args.micro_batch_size): segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]): - self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, :segment_start] == 1)) - self.assertTrue(torch.all(attention_mask[batch_id, segment_start: segment_end, segment_end:] == 1)) + self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, :segment_start] == 1)) + self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, segment_end:] == 1)) # TODO @thomasw21 make sure that we reset `position_ids` From 27197fce310d0f6ce4a6e6c5cf5c6aa03670c490 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:03:23 +0200 Subject: [PATCH 205/297] Remove unecessary unsqueeze --- megatron/utils.py | 2 +- tests/test_dataloaders.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index 6b78ee40c..aa2e38821 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -317,7 +317,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode # Convert attention mask to binary: attention_mask = (attention_mask < 0.5) - return attention_mask[:, None, ...] 
+ return attention_mask def param_size(parameter): return parameter.ds_numel if hasattr(parameter, 'ds_id') else parameter.nelement() diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index f8a0ffae9..9c3c373f4 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -276,7 +276,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): for batch_id in range(args.micro_batch_size): segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]): - self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, :segment_start] == 1)) - self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, segment_end:] == 1)) + self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, :segment_start])) + self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, segment_end:])) # TODO @thomasw21 make sure that we reset `position_ids` From b7374e1c25c55fd3f5ff3aaad57cfb53fbebee4c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:07:57 +0200 Subject: [PATCH 206/297] Add necessary unsqueeze --- megatron/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/utils.py b/megatron/utils.py index aa2e38821..6f3a0fa41 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -278,7 +278,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode causal_inputs_mask = causal_mask else: assert decoder_is_inputs.dtype == torch.bool - inputs_mask = decoder_is_inputs[:, :, None] * decoder_is_inputs[:, None, :] + inputs_mask = decoder_is_inputs[:, None, :, None] * decoder_is_inputs[:, None, None, :] causal_inputs_mask = causal_mask + inputs_mask """Padding Mask: @@ -290,7 +290,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode [1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0]]]] """ - padding_mask = (segment_ids != 0)[:, :, None] * (segment_ids != 0)[:, None, :] + padding_mask = (segment_ids != 0)[:, None, :, None] * (segment_ids != 0)[:, None, None, :] """Segment Mask: mask = [[[[1, 1, 1, 0, 0, 0, 0], @@ -301,7 +301,7 @@ def get_packed_attention_mask(is_causal: bool, causal_mask: torch.Tensor, decode [0, 0, 0, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0]]]] """ - segment_mask = segment_ids[:, :, None] == segment_ids[:, None, :] + segment_mask = segment_ids[:, None, :, None] == segment_ids[:, None, None, :] """Final Mask: mask = [[[[1, 1, 0, 0, 0, 0, 0], From 4f6b7d32fe23dbf3e89c233b63a27d75c0cb178d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:14:50 +0200 Subject: [PATCH 207/297] I'm stupid --- tests/test_dataloaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 9c3c373f4..140f034a8 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -276,7 +276,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): for batch_id in range(args.micro_batch_size): segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]): - self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: 
segment_end, :segment_start])) - self.assertTrue(torch.all(attention_mask[batch_id, 1, segment_start: segment_end, segment_end:])) + self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, :segment_start])) + self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, segment_end:])) # TODO @thomasw21 make sure that we reset `position_ids` From 960b17cbd1ce1b0b513e04917a83bacb94eeb32e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:19:18 +0200 Subject: [PATCH 208/297] I'm stupid --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 140f034a8..63ca03cab 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -274,7 +274,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent for batch_id in range(args.micro_batch_size): - segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + 1 for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]): self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, :segment_start])) self.assertTrue(torch.all(attention_mask[batch_id, 0, segment_start: segment_end, segment_end:])) From 2b522d11ae1bae9ab5291fc2aa3b04d949d26b3c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:39:15 +0200 Subject: [PATCH 209/297] Tokenizers returns None when trying to access a non existing value --- megatron/tokenizer/tokenizer.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index fcc3ed20d..3f0501b7d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -355,23 +355,33 @@ def detokenize(self, token_ids): @property def eod(self): - return self.tokenizer.eos_token_id + candidate = self.tokenizer.eos_token_id + self._check_token_candidate(candidate) + return candidate @property def cls(self): - return self.tokenizer.cls_token_id + candidate = self.tokenizer.cls_token_id + self._check_token_candidate(candidate) + return candidate @property def sep(self): - return self.tokenizer.sep_token_id + candidate = self.tokenizer.sep_token_id + self._check_token_candidate(candidate) + return candidate @property def pad(self): - return self.tokenizer.pad_token_id + candidate = self.tokenizer.pad_token_id + self._check_token_candidate(candidate) + return candidate @property def mask(self): - return self.tokenizer.mask_token_id + candidate = self.tokenizer.mask_token_id + self._check_token_candidate(candidate) + return candidate @property def additional_special_tokens_ids(self): @@ -385,3 +395,8 @@ def bos_token_id(self): @property def eos_token_id(self): raise NotImplementedError("Missing ") + + @staticmethod + def _check_token_candidate(candidate): + if candidate is None: + raise AttributeError("Token doesn't exist") From a8fcd386a89fdc626e4bb1fbe707f676c7389139 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 15:49:18 +0200 Subject: [PATCH 210/297] Force gpt2 to 
have a pad token --- megatron/data/data_samplers.py | 4 ++-- tests/test_dataloaders.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 9eee859da..b6bbdf812 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -69,8 +69,8 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) item_num = 1 cur_len = 0 - decoder_tokens[batch_num][cur_len: cur_len + input_token_len] = token_dict["input_tokens"] - decoder_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = token_dict["target_tokens"] + decoder_tokens[batch_num][cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"]) + decoder_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = torch.from_numpy(token_dict["target_tokens"]) decoder_segment_ids[batch_num][cur_len: cur_len + total_len] = item_num decoder_is_inputs[batch_num][cur_len: cur_len + input_token_len] = 1 # inputs # targets are already 0 at init, no need to update `decoder_is_inputs` diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 63ca03cab..a88859278 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -187,6 +187,10 @@ def test_mtf_packed_dataloader(self): initialize_megatron() args = get_args() + tokenizer = get_tokenizer() + # Hack: `gpt2` doesn't have a padding token, so we override that value. + tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id + train_val_test_num_samples = [ args.train_iters * args.global_batch_size, args.eval_iters * args.global_batch_size, From 7181de456830c0ed3517a3af1c4baaa58e904c03 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 16:40:30 +0200 Subject: [PATCH 211/297] Add a test that the packed_masking works in the modeling side --- tests/test_model.py | 80 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 15 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 6fb1074b2..9bdbf9dd3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -13,8 +13,11 @@ from megatron import initialize_megatron, get_args, get_tokenizer, global_vars from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal from megatron.training import setup_model_and_optimizer -from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe -from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe +import pretrain_gpt +import pretrain_prefix_lm +import finetune_t0_non_causal_decoder +from tests.test_dataloaders import get_dummy_mtf_decoder_packed_data + def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" @@ -88,7 +91,7 @@ def test_gpt(self): args = get_args() tokenizer = get_tokenizer() - model, _, _ = setup_model_and_optimizer(gpt_model_provider) + model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -98,7 +101,7 @@ def test_gpt(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch = get_gpt_batch_pipe({"text": token_ids})[0] + input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] # get a modified version of the first batch, we 
change a specific index changed_index = randint(0, args.seq_length - 2) @@ -136,7 +139,7 @@ def test_prefix_lm_reset_attention_mask(self): args = get_args() tokenizer = get_tokenizer() - model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider) + model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -146,7 +149,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch to have non empty prefix - input_batch, (_, loss_mask), prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids}) + input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): for id in prefix_indices[batch_id]: @@ -223,11 +226,11 @@ def test_prefix_lm_wo_reset_attention_mask(self): initialize_megatron() args = get_args() - model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider) + model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) - input_batch, (_, loss_mask), prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids}) + input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): id = prefix_indices[batch_id] @@ -254,7 +257,7 @@ def test_gpt_rotary_embeddings(self): args = get_args() tokenizer = get_tokenizer() - model, _, _ = setup_model_and_optimizer(gpt_model_provider) + model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -264,7 +267,7 @@ def test_gpt_rotary_embeddings(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch = get_gpt_batch_pipe({"text": token_ids})[0] + input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] model(*input_batch) @@ -307,13 +310,60 @@ def test_fused_layer_norm(self): torch_assert_equal(mfln_output, torch_layer_norm_output) - def test_gpt_model_passed_with_attention_mask_is_not_causal(self): + def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self): # TODO @thomasw21 make sure that if pass a causal mask, it is take in account. The following shows that fused_kernel completely ignores the masking is we set the variable incorrectly. # https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/131bd43e9f3552f2413a442f51c22214d4f6fb19/megatron/model/fused_softmax.py#L190 # Maybe we should pass None is case as attention_mask instead of silently ignoring mask. - # We should test that is we modify an element in a segment, it only affects that segment. 
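# The invariant this new test targets (perturbing one token must not change
# outputs outside its segment) can be sanity-checked with a toy, model-free
# stand-in for one masked-attention step (illustration only):
import torch

def toy_mix(tokens, allowed):
    # each position sums the one-hot "embeddings" of the keys it may attend to
    x = torch.nn.functional.one_hot(tokens, num_classes=16).float()
    return allowed.float() @ x

segment_ids = torch.tensor([1, 1, 1, 2, 2, 2, 0])
allowed = segment_ids[:, None] == segment_ids[None, :]   # block-diagonal mask

tokens_a = torch.tensor([6, 7, 8, 3, 4, 5, 0])
tokens_b = tokens_a.clone()
tokens_b[0] = 9                                          # perturb segment 1
out_a, out_b = toy_mix(tokens_a, allowed), toy_mix(tokens_b, allowed)
assert torch.equal(out_a[3:], out_b[3:])      # other segments are untouched
assert not torch.equal(out_a[:3], out_b[:3])  # the perturbed segment changes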
- raise NotImplementedError() + command_args = get_default_args() + command_args["--position-embedding-type"] = "alibi" + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + deepspeed.init_distributed() + initialize_megatron() + args = get_args() + tokenizer = get_tokenizer() -if __name__ == '__main__': - unittest.main() + data = get_dummy_mtf_decoder_packed_data( + micro_batch_size=args.micro_batch_size, + seq_length=args.seq_length, + vocab_size=args.padded_vocab_size, + special_tokens_ids={tokenizer.pad} + ) + model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) + model = model[0] + + (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) + + output = model(tokens, position_ids, attention_mask) + + ## --------------- CHANGE A TARGET TOKEN --------------------------- + # change the first token in the first batch + change_batch_id = 0 + change_token_id = 0 + token_ids_changed_target = tokens[0].clone() + # We increment the token id on the changed index. + token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size + while token_ids_changed_target[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}: + token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size + + # Test change + output_changed_target = model(token_ids_changed_target, position_ids, attention_mask) + + first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] + # Check that values changed in segment 1 of batch_id 0 + self.assertFalse(torch.any( + equal_vectors( + output[change_batch_id, change_token_id:first_segment_first_batch_id_end], + output_changed_target[change_batch_id, change_token_id:first_segment_first_batch_id_end] + ) + )) + # Check that values did not change in other segments of batch_id 0 + torch_assert_equal( + output[change_batch_id, first_segment_first_batch_id_end:], + output_changed_target[change_batch_id, first_segment_first_batch_id_end:] + ) + # Check that values did not change in other segments of batch_id > 0 + torch_assert_equal(output[:change_batch_id:], output_changed_target[:change_batch_id]) + if change_batch_id + 1 < len(output): + torch_assert_equal(output[change_batch_id + 1:], output_changed_target[change_batch_id + 1:]) From 172306b060377a2af11027547468775759cc5c96 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 16:45:47 +0200 Subject: [PATCH 212/297] Import error --- tests/test_dataloaders.py | 2 +- tests/test_model.py | 41 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index a88859278..effbd6b02 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -1,5 +1,5 @@ import itertools -from typing import List, Set +from typing import Set from unittest.mock import patch import deepspeed diff --git a/tests/test_model.py b/tests/test_model.py index 9bdbf9dd3..a4f288ab9 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,5 +1,5 @@ -import unittest from random import randint +from typing import Set from unittest.mock import patch import deepspeed @@ -16,7 +16,6 
@@ import pretrain_gpt
 import pretrain_prefix_lm
 import finetune_t0_non_causal_decoder
-from tests.test_dataloaders import get_dummy_mtf_decoder_packed_data
 
 
 def get_default_args():
@@ -64,6 +63,44 @@ def equal_vectors(tensor1, tensor2, dim=-1):
     return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
 
 
+def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]):
+    """Code from `tests/test_dataloaders.py"""
+    seq_length += 1
+
+    num_segments = torch.randint(1, 5, ())
+    segment_ids = torch.zeros(micro_batch_size, seq_length, dtype=torch.long)
+    is_inputs = torch.zeros(micro_batch_size, seq_length, dtype=torch.bool)
+    for batch_id in range(micro_batch_size):
+        # - `*2`: Hack to ensure that two start_new_segments are separated by at least two tokens
+        # - `+1`: Hack to ensure that start_new_segments never starts at 0
+        start_new_segments = torch.sort(torch.randperm((seq_length - 2) // 2, )[:num_segments]).values * 2 + 1
+        segment_ids[batch_id, start_new_segments] = 1
+
+        end_inputs = [
+            torch.randint(low=start_segment, high=end_segment, size=())
+            for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
+        ]
+        for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
+            is_inputs[batch_id][start_segment: end_input + 1] = True
+
+    segment_ids = torch.cumsum(segment_ids, dim=-1) + 1
+
+    tokens = torch.randint(high=vocab_size, size=(micro_batch_size, seq_length), dtype=torch.long)
+    flatten_token_view = tokens.view(-1,)
+    for token_id in range(len(flatten_token_view)):
+        token = flatten_token_view[token_id]
+        # While the token is a special token, we change it
+        while token in special_tokens_ids:
+            flatten_token_view[token_id] = (token + 1) % vocab_size
+            token = flatten_token_view[token_id]
+
+    return {
+        "decoder_tokens": tokens,
+        "decoder_segment_ids": segment_ids,
+        "decoder_is_inputs": is_inputs
+    }
+
+
 class MyTestCase(TestCasePlus):
     def setUp(self) -> None:
         super().setUp()

From a4854bd20d1d3edcfbecf39b1b95add5fc94752e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 16:48:28 +0200
Subject: [PATCH 213/297] Tokenizer requires to have pad token

---
 tests/test_dataloaders.py | 3 +++
 tests/test_model.py | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index effbd6b02..38acfee38 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -255,6 +255,9 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self):
         args = get_args()
         tokenizer = get_tokenizer()
 
+        # Hack: `gpt2` doesn't have a padding token, so we override that value.
+        tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id
+
         # Dummy data
         data = get_dummy_mtf_decoder_packed_data(
             micro_batch_size=args.micro_batch_size,
diff --git a/tests/test_model.py b/tests/test_model.py
index a4f288ab9..da0d123b6 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -358,8 +358,11 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i
         with mockenv_context(**self.dist_env_1_gpu):
             deepspeed.init_distributed()
             initialize_megatron()
+            args = get_args()
             tokenizer = get_tokenizer()
 
+            # Hack: `gpt2` doesn't have a padding token, so we override that value.
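+            # This assumes the Megatron tokenizer wraps a HuggingFace tokenizer exposed as
+            # `tokenizer.tokenizer` with `pad_token_id` / `eos_token_id` attributes; reusing
+            # the eos id as pad is harmless here because padded positions are excluded from
+            # the loss mask downstream anyway.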
+ tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id data = get_dummy_mtf_decoder_packed_data( micro_batch_size=args.micro_batch_size, From 06c29a9a3265a0fac8612c7028fcc4f1a335b402 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 16:52:44 +0200 Subject: [PATCH 214/297] Turns out that test_model.py did not use deepspeed version of models --- tests/test_model.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index da0d123b6..6da71a469 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -18,7 +18,7 @@ import finetune_t0_non_causal_decoder -def get_default_args(): +def get_default_args(test_file_dir: str): """return a dictionary with key as argument name and value as additional arguments""" return { # GPT_ARGS @@ -55,6 +55,12 @@ def get_default_args(): "--checkpoint-activations": "", # DATA_ARGS + + # DeepSpeed args + "--deepspeed": "", + "--deepspeed_config": f"{test_file_dir}/ds_config.json", + "--zero-stage": "1", + "--deepspeed-activation-checkpointing": "" } @@ -119,7 +125,7 @@ def setUp(self) -> None: def test_gpt(self): """Test causal invariance, ie past token don't depend on future tokens.""" - command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) with patch('sys.argv', flatten_arguments(command_args)): with mockenv_context(**self.dist_env_1_gpu): @@ -164,7 +170,7 @@ def test_prefix_lm_reset_attention_mask(self): - Target tokens depend on input tokens. - Input tokens depend on all other input tokens, but never target tokens. """ - command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) command_args["--reset-attention-mask"] = "" command_args["--loss-on-targets-only"] = "" @@ -253,7 +259,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): - Target tokens depend on input tokens. - Input tokens depend on all other input tokens, but never target tokens. """ - command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) command_args["--loss-on-targets-only"] = "" @@ -282,7 +288,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): def test_gpt_rotary_embeddings(self): """Test rotary embeddings""" - command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) del command_args["--max-position-embeddings"] command_args["--position-embedding-type"] = "rotary" @@ -311,7 +317,7 @@ def test_gpt_rotary_embeddings(self): #TODO: Check all invariants def test_fused_layer_norm(self): - command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) # Condition to use custom cuda kernel command_args["--bf16"] = "" @@ -351,7 +357,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i # TODO @thomasw21 make sure that if pass a causal mask, it is take in account. The following shows that fused_kernel completely ignores the masking is we set the variable incorrectly. # https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/131bd43e9f3552f2413a442f51c22214d4f6fb19/megatron/model/fused_softmax.py#L190 # Maybe we should pass None is case as attention_mask instead of silently ignoring mask. 
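        # A minimal sketch of the masking semantics the TODO above expects (assuming a
        # boolean mask where True means "masked out"); the fused kernel should be
        # numerically equivalent to this eager path:
        #
        #     def masked_softmax(scores, mask):
        #         scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)
        #         return torch.softmax(scores, dim=-1)
        #
        # The fused path instead bakes mask handling into the kernel, which is why a wrong
        # `attn_mask_type` can be silently ignored rather than raising.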
- command_args = get_default_args() + command_args = get_default_args(self.test_file_dir_str) command_args["--position-embedding-type"] = "alibi" with patch('sys.argv', flatten_arguments(command_args)): From aba48b3f447daf7c249513763e6db715fdce892a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:22:26 +0200 Subject: [PATCH 215/297] Use train_batch instead --- tests/test_model.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 6da71a469..62ca414ae 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -43,6 +43,7 @@ def get_default_args(test_file_dir: str): "--clip-grad": "1.0", "--lr-warmup-fraction": ".01", "--fp16": "", + "--inference": "", "--attention-dropout": "0", "--hidden-dropout": "0", @@ -153,8 +154,8 @@ def test_gpt(self): input_token_ids_changed[:, changed_index] = \ (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model(*input_batch) - output_changed = model(input_token_ids_changed, *input_batch[1:]) + output = model.train_batch(*input_batch) + output_changed = model.train_batch(input_token_ids_changed, *input_batch[1:]) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -201,7 +202,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model(*input_batch) + output = model.train_batch(*input_batch) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -216,7 +217,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model(token_ids_changed_target, *input_batch[1:]) + output_changed_target = model.train_batch(token_ids_changed_target, *input_batch[1:]) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -241,7 +242,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model(token_ids_changed_input, *input_batch[1:]) + output_changed_input = model.train_batch(token_ids_changed_input, *input_batch[1:]) # All tokens should be changed self.assertFalse( @@ -282,7 +283,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. 
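                # (`id` indexes the first target token of a segment, so `id - 1` is the last
                # prefix token; even with loss-on-targets-only its loss-mask entry must be 1,
                # because that position is the one predicting the first target.)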
self.assertTrue(loss_mask[batch_id, id -1] == 1) - model(*input_batch) + model.train_batch(*input_batch) #TODO: Check all invariants @@ -312,7 +313,7 @@ def test_gpt_rotary_embeddings(self): # process batch input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - model(*input_batch) + model.train_batch(*input_batch) #TODO: Check all invariants @@ -381,7 +382,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) - output = model(tokens, position_ids, attention_mask) + output = model.train_batch(tokens, position_ids, attention_mask) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch @@ -394,7 +395,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model(token_ids_changed_target, position_ids, attention_mask) + output_changed_target = model.train_batch(token_ids_changed_target, position_ids, attention_mask) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From a9d423a4473ff6c01df1a4ff2e1f783932d38b76 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:29:21 +0200 Subject: [PATCH 216/297] Make it work via DS --- pretrain_prefix_lm.py | 4 +--- tests/ds_config_inference.json | 15 +++++++++++++++ tests/test_model.py | 5 ++--- 3 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 tests/ds_config_inference.json diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py index f0f97d566..c531db863 100644 --- a/pretrain_prefix_lm.py +++ b/pretrain_prefix_lm.py @@ -47,12 +47,10 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: - assert args.attn_mask_type == AttnMaskType.prefix - model = GPTModelPipe( num_tokentypes=0, parallel_output=True, - attn_mask_type=args.attn_mask_type + attn_mask_type=AttnMaskType.prefix ) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize diff --git a/tests/ds_config_inference.json b/tests/ds_config_inference.json new file mode 100644 index 000000000..91314429e --- /dev/null +++ b/tests/ds_config_inference.json @@ -0,0 +1,15 @@ +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 16, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "zero_allow_untested_optimizer": false, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} diff --git a/tests/test_model.py b/tests/test_model.py index 62ca414ae..9548977f2 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -59,9 +59,8 @@ def get_default_args(test_file_dir: str): # DeepSpeed args "--deepspeed": "", - "--deepspeed_config": f"{test_file_dir}/ds_config.json", - "--zero-stage": "1", - "--deepspeed-activation-checkpointing": "" + "--deepspeed_config": f"{test_file_dir}/ds_config_inference.json", + "--zero-stage": "0", } From 6a95e25e9317b0f277be957842a8128de4757db0 
Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:32:51 +0200 Subject: [PATCH 217/297] Make it work via DS --- tests/test_model.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 9548977f2..39918c713 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -144,7 +144,7 @@ def test_gpt(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] + input_batch, loss_params = pretrain_gpt.get_batch_pipe({"text": token_ids}) # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) @@ -153,8 +153,8 @@ def test_gpt(self): input_token_ids_changed[:, changed_index] = \ (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model.train_batch(*input_batch) - output_changed = model.train_batch(input_token_ids_changed, *input_batch[1:]) + output = model.train_batch(input_batch, loss_params) + output_changed = model.train_batch((input_token_ids_changed, *input_batch[1:]), loss_params) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -192,7 +192,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch to have non empty prefix - input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): for id in prefix_indices[batch_id]: @@ -201,7 +201,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model.train_batch(*input_batch) + output = model.train_batch(input_batch, (labels, loss_mask)) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -216,7 +216,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model.train_batch(token_ids_changed_target, *input_batch[1:]) + output_changed_target = model.train_batch((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask)) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -241,7 +241,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model.train_batch(token_ids_changed_input, *input_batch[1:]) + output_changed_input = model.train_batch((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask)) # All tokens should be changed self.assertFalse( @@ -282,7 +282,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. 
self.assertTrue(loss_mask[batch_id, id -1] == 1) - model.train_batch(*input_batch) + model.train_batch(input_batch, (labels, loss_mask)) #TODO: Check all invariants @@ -312,7 +312,7 @@ def test_gpt_rotary_embeddings(self): # process batch input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - model.train_batch(*input_batch) + model.train_batch(input_batch, (labels, loss_mask)) #TODO: Check all invariants @@ -381,7 +381,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) - output = model.train_batch(tokens, position_ids, attention_mask) + output = model.train_batch((tokens, position_ids, attention_mask), (labels, loss_mask)) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch @@ -394,7 +394,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model.train_batch(token_ids_changed_target, position_ids, attention_mask) + output_changed_target = model.train_batch((token_ids_changed_target, position_ids, attention_mask), (labels, loss_mask)) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From d6e435b1fd5ca7714342957b4606dfa6c934b6c1 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:36:55 +0200 Subject: [PATCH 218/297] Make it work via DS --- tests/test_model.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 39918c713..dfa3489ef 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -69,6 +69,10 @@ def equal_vectors(tensor1, tensor2, dim=-1): return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0 +def iter_out_of_one(one): + return iter([one]) + + def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vocab_size: int, special_tokens_ids: Set[int]): """Code from `tests/test_dataloaders.py""" seq_length += 1 @@ -153,8 +157,8 @@ def test_gpt(self): input_token_ids_changed[:, changed_index] = \ (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model.train_batch(input_batch, loss_params) - output_changed = model.train_batch((input_token_ids_changed, *input_batch[1:]), loss_params) + output = model.train_batch(iter_out_of_one(input_batch)) + output_changed = model.train_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:]))) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -201,7 +205,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. 
self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model.train_batch(input_batch, (labels, loss_mask)) + output = model.train_batch(iter_out_of_one(input_batch)) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -216,7 +220,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model.train_batch((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask)) + output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:]))) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -241,7 +245,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model.train_batch((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask)) + output_changed_input = model.train_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:]))) # All tokens should be changed self.assertFalse( @@ -282,7 +286,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - model.train_batch(input_batch, (labels, loss_mask)) + model.train_batch(iter_out_of_one(input_batch)) #TODO: Check all invariants @@ -312,7 +316,7 @@ def test_gpt_rotary_embeddings(self): # process batch input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - model.train_batch(input_batch, (labels, loss_mask)) + model.train_batch(iter_out_of_one(input_batch)) #TODO: Check all invariants @@ -381,7 +385,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) - output = model.train_batch((tokens, position_ids, attention_mask), (labels, loss_mask)) + output = model.train_batch(iter_out_of_one((tokens, position_ids, attention_mask))) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch @@ -394,7 +398,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model.train_batch((token_ids_changed_target, position_ids, attention_mask), (labels, loss_mask)) + output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask))) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From ca8c04a7ce84ea039e0629d942793920ab4fe32e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:47:09 +0200 Subject: [PATCH 219/297] Make it work via DS --- tests/test_model.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index dfa3489ef..544d736cc 100644 --- 
a/tests/test_model.py +++ b/tests/test_model.py @@ -140,6 +140,7 @@ def test_gpt(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -148,7 +149,7 @@ def test_gpt(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch, loss_params = pretrain_gpt.get_batch_pipe({"text": token_ids}) + input_batch, (labels, loss_mask) = pretrain_gpt.get_batch_pipe({"text": token_ids}) # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) @@ -157,8 +158,8 @@ def test_gpt(self): input_token_ids_changed[:, changed_index] = \ (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model.train_batch(iter_out_of_one(input_batch)) - output_changed = model.train_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:]))) + output = model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) + output_changed = model.train_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:])), (labels, loss_mask)) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -188,6 +189,7 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -205,7 +207,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model.train_batch(iter_out_of_one(input_batch)) + output = model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -220,7 +222,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:]))) + output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:])), (labels, loss_mask)) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -245,7 +247,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model.train_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:]))) + output_changed_input = model.train_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:])), (labels, loss_mask)) # All tokens should be changed self.assertFalse( @@ -275,9 +277,10 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) - input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (labels, loss_mask), prefix_indices = 
pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): id = prefix_indices[batch_id] @@ -286,7 +289,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - model.train_batch(iter_out_of_one(input_batch)) + model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) #TODO: Check all invariants @@ -306,6 +309,7 @@ def test_gpt_rotary_embeddings(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -314,9 +318,9 @@ def test_gpt_rotary_embeddings(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] + input_batch, (labels, loss_mask) = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - model.train_batch(iter_out_of_one(input_batch)) + model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) #TODO: Check all invariants @@ -382,10 +386,11 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i ) model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] + model._compute_loss = False (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) - output = model.train_batch(iter_out_of_one((tokens, position_ids, attention_mask))) + output = model.train_batch(iter_out_of_one((tokens, position_ids, attention_mask)), (labels, loss_mask)) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch @@ -398,7 +403,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask))) + output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask)), (labels, loss_mask)) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From f3231db3bcf6e3996258af4599b9fc1d701e910e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:56:38 +0200 Subject: [PATCH 220/297] Make it work via DS --- tests/test_model.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 544d736cc..8c72f3e4c 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -140,7 +140,6 @@ def test_gpt(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] - model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -149,7 +148,7 @@ def test_gpt(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch, (labels, loss_mask) = pretrain_gpt.get_batch_pipe({"text": token_ids}) + input_batch, _ = pretrain_gpt.get_batch_pipe({"text": 
token_ids}) # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) @@ -158,8 +157,8 @@ def test_gpt(self): input_token_ids_changed[:, changed_index] = \ (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) - output_changed = model.train_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:])), (labels, loss_mask)) + output = model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) + output_changed = model.eval_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:])), compute_loss=False) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -189,7 +188,6 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -198,7 +196,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch to have non empty prefix - input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): for id in prefix_indices[batch_id]: @@ -207,7 +205,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) + output = model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -222,7 +220,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:])), (labels, loss_mask)) + output_changed_target = model.eval_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:])), compute_loss=False) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -247,7 +245,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model.train_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:])), (labels, loss_mask)) + output_changed_input = model.eval_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:])), compute_loss=False) # All tokens should be changed self.assertFalse( @@ -277,10 +275,9 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) - input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (_, loss_mask), prefix_indices = 
pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): id = prefix_indices[batch_id] @@ -289,7 +286,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. self.assertTrue(loss_mask[batch_id, id -1] == 1) - model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) + model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) #TODO: Check all invariants @@ -309,7 +306,6 @@ def test_gpt_rotary_embeddings(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] - model._compute_loss = False token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -318,9 +314,9 @@ def test_gpt_rotary_embeddings(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch - input_batch, (labels, loss_mask) = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] + input_batch, _ = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - model.train_batch(iter_out_of_one(input_batch), (labels, loss_mask)) + model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) #TODO: Check all invariants @@ -386,11 +382,10 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i ) model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] - model._compute_loss = False - (tokens, position_ids, attention_mask), (labels, loss_mask) = finetune_t0_non_causal_decoder.get_batch_pipe(data) + (tokens, position_ids, attention_mask), _ = finetune_t0_non_causal_decoder.get_batch_pipe(data) - output = model.train_batch(iter_out_of_one((tokens, position_ids, attention_mask)), (labels, loss_mask)) + output = model.eval_batch(iter_out_of_one((tokens, position_ids, attention_mask)), compute_loss=False) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch @@ -403,7 +398,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model.train_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask)), (labels, loss_mask)) + output_changed_target = model.eval_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask)), compute_loss=False) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From 987e6b4bb8390990314a9e282eb860a59e52f0aa Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:13:54 +0200 Subject: [PATCH 221/297] Make it work via DS --- tests/test_model.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 8c72f3e4c..59ab09d2d 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -27,8 +27,8 @@ def get_default_args(test_file_dir: str): "--num-attention-heads": "4", "--seq-length": "256", "--max-position-embeddings": "256", - "--micro-batch-size": "4", - "--global-batch-size": "8", + "--micro-batch-size": "1", + "--global-batch-size": "1", "--lr-decay-iters": "320000", "--lr-decay-style": "cosine", 
"--lr": "0.00015", @@ -152,13 +152,13 @@ def test_gpt(self): # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) - input_token_ids_changed = input_batch[0].clone() + token_ids_changed = token_ids[0].clone() # We increment the token_id by one for that index in order to artificially change the sequence. - input_token_ids_changed[:, changed_index] = \ - (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size + token_ids_changed[:, changed_index] = \ + (token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size output = model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) - output_changed = model.eval_batch(iter_out_of_one((input_token_ids_changed, *input_batch[1:])), compute_loss=False) + output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False) # All token in past should be unchanged torch_assert_equal(output[:, :changed_index], output_changed[:, :changed_index]) @@ -188,6 +188,8 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + # we preprocess batch_fn manually + model._megatron_batch_fn = None token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -275,6 +277,8 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + # we preprocess batch_fn manually + model._megatron_batch_fn = None token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) @@ -313,10 +317,7 @@ def test_gpt_rotary_embeddings(self): token_ids[token_ids == tokenizer.eod] += 1 token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size - # process batch - input_batch, _ = pretrain_gpt.get_batch_pipe({"text": token_ids})[0] - - model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) + model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False) #TODO: Check all invariants @@ -383,22 +384,20 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] - (tokens, position_ids, attention_mask), _ = finetune_t0_non_causal_decoder.get_batch_pipe(data) - - output = model.eval_batch(iter_out_of_one((tokens, position_ids, attention_mask)), compute_loss=False) + output = model.eval_batch(iter_out_of_one(data), compute_loss=False) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the first token in the first batch change_batch_id = 0 change_token_id = 0 - token_ids_changed_target = tokens[0].clone() + token_ids_changed = data["decoder_token_ids"][0].clone() # We increment the token id on the changed index. 
- token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size - while token_ids_changed_target[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}: - token_ids_changed_target[change_batch_id, change_token_id] = (token_ids_changed_target[change_batch_id, change_token_id] + 1) % args.padded_vocab_size + token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size + while token_ids_changed[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}: + token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size # Test change - output_changed_target = model.eval_batch(iter_out_of_one((token_ids_changed_target, position_ids, attention_mask)), compute_loss=False) + output_changed_target = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed}), compute_loss=False) first_segment_first_batch_id_end = (torch.nonzero(data["decoder_segment_ids"][change_batch_id, 1:] - data["decoder_segment_ids"][change_batch_id, :-1]) + 1)[0] # Check that values changed in segment 1 of batch_id 0 From 0b27fb670f7e774d05135dcda67543156168fc59 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:21:00 +0200 Subject: [PATCH 222/297] Make it work via DS --- tests/test_model.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 59ab09d2d..1d199c629 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -147,9 +147,6 @@ def test_gpt(self): token_ids[token_ids == tokenizer.eod] += 1 token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size - # process batch - input_batch, _ = pretrain_gpt.get_batch_pipe({"text": token_ids}) - # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) token_ids_changed = token_ids[0].clone() @@ -157,7 +154,7 @@ def test_gpt(self): token_ids_changed[:, changed_index] = \ (token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size - output = model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) + output = model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False) output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False) # All token in past should be unchanged @@ -189,7 +186,7 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] # we preprocess batch_fn manually - model._megatron_batch_fn = None + model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -278,7 +275,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] # we preprocess batch_fn manually - model._megatron_batch_fn = None + model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) From 1ba5d4a13d937fe8fbe83c1d5579f6f57242cafd Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:24:47 +0200 Subject: [PATCH 
223/297] Woops --- tests/test_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 1d199c629..578e3a389 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -149,10 +149,10 @@ def test_gpt(self): # get a modified version of the first batch, we change a specific index changed_index = randint(0, args.seq_length - 2) - token_ids_changed = token_ids[0].clone() + token_ids_changed = token_ids.clone() # We increment the token_id by one for that index in order to artificially change the sequence. token_ids_changed[:, changed_index] = \ - (token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size + (token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size output = model.eval_batch(iter_out_of_one({"text": token_ids}), compute_loss=False) output_changed = model.eval_batch(iter_out_of_one({"text": token_ids_changed}), compute_loss=False) From cbab16ca7377ed17b20cdd087d05982ed5f35b7e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:31:32 +0200 Subject: [PATCH 224/297] Make it work via DS --- tests/test_model.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_model.py b/tests/test_model.py index 578e3a389..62fed2824 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -140,6 +140,7 @@ def test_gpt(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model.set_train_batch_size(1) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -185,6 +186,7 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model.set_train_batch_size(1) # we preprocess batch_fn manually model.set_batch_fn(None) @@ -274,6 +276,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model.set_train_batch_size(1) # we preprocess batch_fn manually model.set_batch_fn(None) @@ -307,6 +310,7 @@ def test_gpt_rotary_embeddings(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model.set_train_batch_size(1) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -380,6 +384,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i ) model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] + model.set_train_batch_size(1) output = model.eval_batch(iter_out_of_one(data), compute_loss=False) From 4defbb2c70d5254ac745d1c5599734e82f29e8ac Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:38:27 +0200 Subject: [PATCH 225/297] Make it work via DS --- finetune_t0_non_causal_decoder.py | 18 +++++++++--------- megatron/data/data_samplers.py | 2 +- tests/test_dataloaders.py | 12 ++++++------ tests/test_model.py | 14 +++++++------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 4b8717f8a..f203b4661 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -23,13 +23,13 @@ def record(fn): def model_provider(pre_process=True, post_process=True): """Build the model.""" - print_rank_0('building GPT model ...') + print_rank_0("building GPT model ...") 
see_memory_usage(f"Before Building Model", force=True) args = get_args() with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), - remote_device=None if args.remote_device == 'none' else args.remote_device, + remote_device=None if args.remote_device == "none" else args.remote_device, config_dict_or_path=args.deepspeed_config, enabled=args.zero_stage == 3, mpu=mpu): @@ -60,16 +60,16 @@ def get_batch_pipe(data): tokenizer = get_tokenizer() # Broadcast data. - data_b = mpu.broadcast_data(['decoder_tokens', 'decoder_segment_ids'], data, torch.int64) - data_c = mpu.broadcast_data(['decoder_is_inputs'], data, torch.bool) + data_b = mpu.broadcast_data(["decoder_token_ids", "decoder_segment_ids"], data, torch.int64) + data_c = mpu.broadcast_data(["decoder_is_inputs"], data, torch.bool) # Unpack. - tokens_ = data_b['decoder_tokens'].long() + tokens_ = data_b["decoder_token_ids"].long() labels = tokens_[:, 1:].contiguous() tokens = tokens_[:, :-1].contiguous() - segment_ids = data_b['decoder_segment_ids'].long()[:, :-1] - decoder_is_inputs = data_c['decoder_is_inputs'][:, :-1] + segment_ids = data_b["decoder_segment_ids"].long()[:, :-1] + decoder_is_inputs = data_c["decoder_is_inputs"][:, :-1] # Get the masks and position ids. causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( @@ -82,7 +82,7 @@ def get_batch_pipe(data): loss_on_targets_only=False # This is done below ) # Only compute loss over causal target tokens, i.e. ignore input_tokens & padding - loss_on_targets_only = ~data_c['decoder_is_inputs'][:, 1:] + loss_on_targets_only = ~data_c["decoder_is_inputs"][:, 1:] loss_on_non_pad_only = (tokens != tokenizer.pad) loss_mask *= loss_on_targets_only * loss_on_non_pad_only @@ -105,7 +105,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): args = get_args() train_ds, valid_ds, test_ds = None, None, None - print_rank_0('> building train, validation, and test datasets for T0 ...') + print_rank_0("> building train, validation, and test datasets for T0 ...") # Option 1 of data loading using --data-path # For T0, data has to be provided in the form --data-path input-data target-data input-data2 target-data2 ... 
if args.data_path: diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index b6bbdf812..3e7df32d3 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -81,7 +81,7 @@ def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int) # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here return { - "decoder_tokens": decoder_tokens, + "decoder_token_ids": decoder_tokens, "decoder_segment_ids": decoder_segment_ids, "decoder_is_inputs": decoder_is_inputs, } diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 38acfee38..867641906 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -82,7 +82,7 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo token = flatten_token_view[token_id] return { - "decoder_tokens": tokens, + "decoder_token_ids": tokens, "decoder_segment_ids": segment_ids, "decoder_is_inputs": is_inputs } @@ -212,10 +212,10 @@ def test_mtf_packed_dataloader(self): last_padding_size = 0 for i, items in enumerate(batch_sampler): - micro_batch_size, seq_length = items["decoder_tokens"].shape + micro_batch_size, seq_length = items["decoder_token_ids"].shape # Check dtypes - self.assertEqual(items["decoder_tokens"].dtype, torch.int64) + self.assertEqual(items["decoder_token_ids"].dtype, torch.int64) self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64) self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool) @@ -275,9 +275,9 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): loss_mask = loss_mask.cpu() self.assertEqual(loss_mask.dtype, torch.float) - torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_tokens"] != tokenizer.pad)) - torch_assert_equal(tokens, data["decoder_tokens"][:, :-1]) - torch_assert_equal(labels, data["decoder_tokens"][:, 1:]) + torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_token_ids"] != tokenizer.pad)) + torch_assert_equal(tokens, data["decoder_token_ids"][:, :-1]) + torch_assert_equal(labels, data["decoder_token_ids"][:, 1:]) # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent for batch_id in range(args.micro_batch_size): diff --git a/tests/test_model.py b/tests/test_model.py index 62fed2824..294848faa 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -105,7 +105,7 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo token = flatten_token_view[token_id] return { - "decoder_tokens": tokens, + "decoder_token_ids": tokens, "decoder_segment_ids": segment_ids, "decoder_is_inputs": is_inputs } @@ -197,7 +197,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size # process batch to have non empty prefix - input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): for id in prefix_indices[batch_id]: @@ -206,7 +206,7 @@ def test_prefix_lm_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. 
self.assertTrue(loss_mask[batch_id, id -1] == 1) - output = model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) + output = model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False) ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch @@ -221,7 +221,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size # Test change - output_changed_target = model.eval_batch(iter_out_of_one((token_ids_changed_target, *input_batch[1:])), compute_loss=False) + output_changed_target = model.eval_batch(iter_out_of_one(((token_ids_changed_target, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False) # All token in past should be unchanged torch_assert_equal(output[0, :changed_target_index], output_changed_target[0, :changed_target_index]) @@ -246,7 +246,7 @@ def test_prefix_lm_reset_attention_mask(self): token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size - output_changed_input = model.eval_batch(iter_out_of_one((token_ids_changed_input, *input_batch[1:])), compute_loss=False) + output_changed_input = model.eval_batch(iter_out_of_one(((token_ids_changed_input, *input_batch[1:]), (labels, loss_mask), prefix_indices)), compute_loss=False) # All tokens should be changed self.assertFalse( @@ -281,7 +281,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) - input_batch, (_, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) + input_batch, (labels, loss_mask), prefix_indices = pretrain_prefix_lm.get_batch_pipe({"text": token_ids}) for batch_id in range(len(prefix_indices)): id = prefix_indices[batch_id] @@ -290,7 +290,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): # Make sure that the last prefix token predicts the first token. 
self.assertTrue(loss_mask[batch_id, id -1] == 1) - model.eval_batch(iter_out_of_one(input_batch), compute_loss=False) + model.eval_batch(iter_out_of_one((input_batch, (labels, loss_mask), prefix_indices)), compute_loss=False) #TODO: Check all invariants From 412939c03658b72bbd8139f2ad0d1bae6449bfc5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:40:59 +0200 Subject: [PATCH 226/297] Make it work via DS --- tests/test_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 294848faa..9d65e38c3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -27,8 +27,8 @@ def get_default_args(test_file_dir: str): "--num-attention-heads": "4", "--seq-length": "256", "--max-position-embeddings": "256", - "--micro-batch-size": "1", - "--global-batch-size": "1", + "--micro-batch-size": "2", + "--global-batch-size": "2", "--lr-decay-iters": "320000", "--lr-decay-style": "cosine", "--lr": "0.00015", @@ -140,7 +140,7 @@ def test_gpt(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] - model.set_train_batch_size(1) + model.set_train_batch_size(args.micro_batch_size) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -186,7 +186,7 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model.set_train_batch_size(1) + model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually model.set_batch_fn(None) @@ -276,7 +276,7 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model.set_train_batch_size(1) + model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually model.set_batch_fn(None) @@ -310,7 +310,7 @@ def test_gpt_rotary_embeddings(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] - model.set_train_batch_size(1) + model.set_train_batch_size(args.micro_batch_size) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -384,7 +384,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i ) model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] - model.set_train_batch_size(1) + model.set_train_batch_size(args.micro_batch_size) output = model.eval_batch(iter_out_of_one(data), compute_loss=False) From 17a6cc0a132720e66aaf1a4f62800403179f03ea Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 18:44:35 +0200 Subject: [PATCH 227/297] Maybe --- tests/test_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 9d65e38c3..ec32c555c 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -186,8 +186,8 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually + model.set_train_batch_size(1) model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -276,8 +276,8 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = 
setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] - model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually + model.set_train_batch_size(1) model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) From cb90679e501f1019f53ec63fcf15d43b47cb8e37 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:04:48 +0200 Subject: [PATCH 228/297] Make it work via DS --- tests/test_model.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index ec32c555c..3662677a3 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -140,6 +140,7 @@ def test_gpt(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model._config.train_micro_batch_size_per_gpu = args.micro_batch_size model.set_train_batch_size(args.micro_batch_size) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -186,8 +187,9 @@ def test_prefix_lm_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model._config.train_micro_batch_size_per_gpu = args.micro_batch_size + model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually - model.set_train_batch_size(1) model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -276,8 +278,9 @@ def test_prefix_lm_wo_reset_attention_mask(self): model, _, _ = setup_model_and_optimizer(pretrain_prefix_lm.model_provider) model = model[0] + model._config.train_micro_batch_size_per_gpu = args.micro_batch_size + model.set_train_batch_size(args.micro_batch_size) # we preprocess batch_fn manually - model.set_train_batch_size(1) model.set_batch_fn(None) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -310,6 +313,7 @@ def test_gpt_rotary_embeddings(self): model, _, _ = setup_model_and_optimizer(pretrain_gpt.model_provider) model = model[0] + model._config.train_micro_batch_size_per_gpu = args.micro_batch_size model.set_train_batch_size(args.micro_batch_size) token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length)) @@ -384,6 +388,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i ) model, _, _ = setup_model_and_optimizer(finetune_t0_non_causal_decoder.model_provider) model = model[0] + model._config.train_micro_batch_size_per_gpu = args.micro_batch_size model.set_train_batch_size(args.micro_batch_size) output = model.eval_batch(iter_out_of_one(data), compute_loss=False) From bd4a3f07dc5ca20513e5744d53938ce24296279c Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:07:13 +0200 Subject: [PATCH 229/297] Woops --- tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index 3662677a3..fcf4b9897 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -397,7 +397,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i # change the first token in the first batch change_batch_id = 0 change_token_id = 0 - token_ids_changed = data["decoder_token_ids"][0].clone() + token_ids_changed = data["decoder_token_ids"].clone() # We increment the token id on the changed index. 
token_ids_changed[change_batch_id, change_token_id] = (token_ids_changed[change_batch_id, change_token_id] + 1) % args.padded_vocab_size while token_ids_changed[change_batch_id, change_token_id] in {tokenizer.eod, tokenizer.pad}: From 6604035402aa85ff2c96a55fd92b9f0b08cea293 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:18:50 +0200 Subject: [PATCH 230/297] Try having very strict mask --- megatron/model/utils.py | 4 ++-- tests/test_model.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 8c3908a93..18f008de8 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -48,9 +48,9 @@ def attention_mask_func(attention_scores, attention_mask): if actual_seqlen != attention_mask_.size()[2]: # attention_mask has size [1, 1, seqlen, seqlen] attention_mask_ = attention_mask_[:, :, :actual_seqlen, :actual_seqlen].contiguous() - attention_scores.masked_fill_(attention_mask_, -10000.0) + attention_scores.masked_fill_(attention_mask_, torch.finfo(attention_scores.dtype).min) else: - attention_scores.masked_fill_(attention_mask, -10000.0) + attention_scores.masked_fill_(attention_mask, torch.finfo(attention_scores.dtype).min) return attention_scores diff --git a/tests/test_model.py b/tests/test_model.py index fcf4b9897..c3003cc06 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -11,7 +11,8 @@ from packaging import version from megatron import initialize_megatron, get_args, get_tokenizer, global_vars -from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal +from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \ + torch_assert_close from megatron.training import setup_model_and_optimizer import pretrain_gpt import pretrain_prefix_lm From d98e39a501668244687c2a3a2d09d1c829dd9ca2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:25:47 +0200 Subject: [PATCH 231/297] Try updating the kernel --- megatron/fused_kernels/scaled_masked_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index e57fd04c6..f0814c9b0 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -269,7 +269,7 @@ __global__ void scaled_masked_softmax_warp_forward( if (temp_mask[element] != 1) { elements[i][it + element] = (acc_t)temp_data[element] * scale; } else { - elements[i][it + element] = -10000.0; + elements[i][it + element] = -std::numeric_limits::infinity() ; } } } else { From 849508341eb110b3df6aa323950a8bab0da9adff Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:29:05 +0200 Subject: [PATCH 232/297] Try updating the kernel --- megatron/fused_kernels/scaled_masked_softmax.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index f0814c9b0..9954674dd 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -298,7 +298,11 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - elements[i][it] = 
std::exp((elements[i][it] - max_value[i]));
+            if (elements[i][it] == -std::numeric_limits<acc_t>::infinity()) {
+                elements[i][it] = 0;
+            } else {
+                elements[i][it] = std::exp((elements[i][it] - max_value[i]));
+            }
             sum[i] += elements[i][it];
         }
     }

From ef5d4d4db314d74049369f1322d343f96b289769 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 19:34:55 +0200
Subject: [PATCH 233/297] Try updating the kernel

---
 megatron/fused_kernels/scaled_masked_softmax.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index 9954674dd..4656adbde 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward(
     for (int i = 0; i < WARP_BATCH; ++i) {
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            if (elements[i][it] == -std::numeric_limits<acc_t>::infinity()) {
+            if (elements[i][it] < -std::numeric_limits<acc_t>::infinity() + max_value[i]) {
                 elements[i][it] = 0;
             } else {
                 elements[i][it] = std::exp((elements[i][it] - max_value[i]));
@@ -320,7 +320,12 @@ __global__ void scaled_masked_softmax_warp_forward(
             if (element_index < element_count) {
                 #pragma unroll
                 for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
-                    out[element] = elements[i][it + element] / sum[i];
+                    if (sum[i] == 0) {
+                        // Essentially the whole square is 0
+                        out[element] = 0;
+                    } else {
+                        out[element] = elements[i][it + element] / sum[i];
+                    }
                 }
                 copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
             } else {

From 69912b3f9a2c5bba4f84fdd00f0d7eda31bfbdc4 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 19:40:53 +0200
Subject: [PATCH 234/297] Try updating the kernel

---
 megatron/fused_kernels/scaled_masked_softmax.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index 4656adbde..147145878 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -299,7 +299,7 @@ __global__ void scaled_masked_softmax_warp_forward(
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
             if (elements[i][it] < -std::numeric_limits<acc_t>::infinity() + max_value[i]) {
-                elements[i][it] = 0;
+                elements[i][it] = 0.0f;
             } else {
                 elements[i][it] = std::exp((elements[i][it] - max_value[i]));
             }
@@ -320,12 +320,7 @@ __global__ void scaled_masked_softmax_warp_forward(
             if (element_index < element_count) {
                 #pragma unroll
                 for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) {
-                    if (sum[i] == 0) {
-                        // Essentially the whole square is 0
-                        out[element] = 0;
-                    } else {
-                        out[element] = elements[i][it + element] / sum[i];
-                    }
+                    out[element] = elements[i][it + element] / sum[i];
                 }
                 copy_vector<output_t, ELEMENTS_PER_LDG_STG>(dst + i * element_count + it * WARP_SIZE, out);
             } else {

From 866fc56e5fa9208a5f9e3a355b929547edaa0330 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 19:54:26 +0200
Subject: [PATCH 235/297] Try updating the kernel

---
 megatron/fused_kernels/scaled_masked_softmax.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index 147145878..c9479de3a 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -269,7 +269,7 @@ __global__ void scaled_masked_softmax_warp_forward(
                     if (temp_mask[element] != 1) {
                         elements[i][it + element] = (acc_t)temp_data[element] * scale;
                     } else {
-                        elements[i][it + element] = -std::numeric_limits<acc_t>::infinity() ;
+                        elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
                     }
                 }
             } else {
@@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward(
     for (int i = 0; i < WARP_BATCH; ++i) {
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            if (elements[i][it] < -std::numeric_limits<acc_t>::infinity() + max_value[i]) {
+            if (elements[i][it] == -std::numeric_limits<acc_t>::infinity()) {
                 elements[i][it] = 0.0f;
             } else {
                 elements[i][it] = std::exp((elements[i][it] - max_value[i]));

From 8e9701b34d55d547c632f4b92b38f6bf4bfb6d7e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 20:00:13 +0200
Subject: [PATCH 236/297] Try updating the kernel

---
 megatron/fused_kernels/scaled_masked_softmax.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index c9479de3a..b0e0d2edf 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -269,7 +269,7 @@ __global__ void scaled_masked_softmax_warp_forward(
                     if (temp_mask[element] != 1) {
                         elements[i][it + element] = (acc_t)temp_data[element] * scale;
                     } else {
-                        elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
+                        elements[i][it + element] = std::numeric_limits<acc_t>::min();
                     }
                 }
             } else {
@@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward(
     for (int i = 0; i < WARP_BATCH; ++i) {
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            if (elements[i][it] == -std::numeric_limits<acc_t>::infinity()) {
+            if (elements[i][it] == std::numeric_limits<acc_t>::min()) {
                 elements[i][it] = 0.0f;
             } else {
                 elements[i][it] = std::exp((elements[i][it] - max_value[i]));

From 15d95faf3d8366739abe22321ac3c33aa5a62a84 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 20:56:41 +0200
Subject: [PATCH 237/297] Inverse causal masking

---
 finetune_t0_non_causal_decoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py
index f203b4661..cb6776b19 100644
--- a/finetune_t0_non_causal_decoder.py
+++ b/finetune_t0_non_causal_decoder.py
@@ -89,7 +89,7 @@ def get_batch_pipe(data):
     attention_mask = get_packed_attention_mask(
         # Run non-causal decoder
         is_causal=False,
-        causal_mask=causal_mask.bool(),
+        causal_mask=~(causal_mask.bool()),
         decoder_is_inputs=decoder_is_inputs.bool(),
         segment_ids=segment_ids.long(),
     )

From fe4f806cdda7e8c4a50f487625ad949d86c52396 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 1 Jul 2022 21:09:14 +0200
Subject: [PATCH 238/297] Check that the padding is ignored

---
 megatron/fused_kernels/scaled_masked_softmax.h |  9 ++++++---
 tests/test_model.py                            | 18 ++++++++++++++++--
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index b0e0d2edf..9d8e41353 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -269,7 +269,7 @@ __global__ void scaled_masked_softmax_warp_forward(
                     if (temp_mask[element] != 1) {
                         elements[i][it + element] = (acc_t)temp_data[element] * scale;
                     } else {
-                        elements[i][it + element] = std::numeric_limits<acc_t>::min();
+                        elements[i][it + element] = -std::numeric_limits<acc_t>::infinity();
                     }
                 }
             } else {
@@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward(
     for (int i = 0; i < WARP_BATCH; ++i) {
         #pragma unroll
         for (int it = 0; it < WARP_ITERATIONS; ++it) {
-            if (elements[i][it] == std::numeric_limits<acc_t>::min()) {
+            if (elements[i][it] < -std::numeric_limits<acc_t>::infinity()) {
                 elements[i][it] = 0.0f;
             } else {
                 elements[i][it] = std::exp((elements[i][it] - max_value[i]));
@@ -309,9 +309,12 @@ __global__ void scaled_masked_softmax_warp_forward(
     warp_reduce<acc_t, WARP_BATCH, WARP_SIZE, Add>(sum);

     // store result
-    output_t out[ELEMENTS_PER_LDG_STG];
+    output_t out[ELEMENTS_PER_LDG_STG] { 0.0f };
     #pragma unroll
     for (int i = 0; i < WARP_BATCH; ++i) {
+        if (sum[i] == 0.0) {
+            break;
+        }
         if (i >= local_batches)
             break;
         #pragma unroll
diff --git a/tests/test_model.py b/tests/test_model.py
index c3003cc06..21c07e535 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -88,7 +88,7 @@ def get_dummy_mtf_decoder_packed_data(micro_batch_size: int, seq_length: int, vo
         segment_ids[batch_id, start_new_segments] = 1

     end_inputs = [
-        torch.randint(low=start_segment, high=end_segment, size=())
+        torch.randint(low=start_segment, high=end_segment - 1, size=())
         for start_segment, end_segment in zip([0, *start_new_segments], [*start_new_segments, seq_length])
     ]
     for end_input, start_segment in zip(end_inputs, [0, *start_new_segments]):
@@ -395,7 +395,7 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i
         output = model.eval_batch(iter_out_of_one(data), compute_loss=False)

         ## --------------- CHANGE A TARGET TOKEN ---------------------------
-        # change the first token in the first batch
+        # change the first token in the first batch to a random value
         change_batch_id = 0
         change_token_id = 0
         token_ids_changed = data["decoder_token_ids"].clone()
@@ -424,3 +424,17 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i
             torch_assert_equal(output[:change_batch_id:], output_changed_target[:change_batch_id])
         if change_batch_id + 1 < len(output):
             torch_assert_equal(output[change_batch_id + 1:], output_changed_target[change_batch_id + 1:])
+
+        ## --------------- CHANGE A TARGET TOKEN ---------------------------
+        # change the last token in the first batch to a pad
+        token_ids_changed_pad = data["decoder_token_ids"].clone()
+        segment_ids_changed_pad = data["decoder_segment_ids"].clone()
+        # We increment the token id on the changed index.
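+        # (more precisely, the two lines below overwrite the last token with the pad token and zero out its segment id)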
+ token_ids_changed_pad[change_batch_id, -1] = tokenizer.pad + segment_ids_changed_pad[change_batch_id, -1] = 0 + + # Test model handles padding correctly + output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False) + + print(output_changed_pad) + self.assertFalse(torch.isnan(output_changed_pad)) From cc2aff57d3efe98f833f757a233340118a2ae810 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 21:13:01 +0200 Subject: [PATCH 239/297] Fix test --- tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index 21c07e535..1efed8ef9 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -437,4 +437,4 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False) print(output_changed_pad) - self.assertFalse(torch.isnan(output_changed_pad)) + self.assertFalse(torch.any(torch.isnan(output_changed_pad))) From 93cde870484fa039fa136078c438e2e9a0bdeb45 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 22:11:32 +0200 Subject: [PATCH 240/297] Probably should be in this order: --- megatron/fused_kernels/scaled_masked_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 9d8e41353..01f708fad 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (elements[i][it] < -std::numeric_limits::infinity()) { + if (elements[i][it] > -std::numeric_limits::infinity()) { elements[i][it] = 0.0f; } else { elements[i][it] = std::exp((elements[i][it] - max_value[i])); From f6d717b4ba2c4d1a968112d4ca13c50c3f3b6aea Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 22:17:45 +0200 Subject: [PATCH 241/297] Revert "Probably should be in this order:" This reverts commit 93cde870484fa039fa136078c438e2e9a0bdeb45. 
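
Why the revert: for any finite (unmasked) score x, `x > -inf` is true, so the
flipped guard zeroed out every unmasked element and the row was then divided
by a zero sum. The restored `x < -inf` is never true for any float, so the
guard is a no-op: masked `-inf` entries still reach std::exp, which is fine
(exp(-inf) == 0) except when an entire row is masked and max_value[i] is
itself -inf. A later patch in this series settles on `x <= -inf`, which
matches exactly the -inf sentinel. A minimal, standalone illustration of the
three comparisons (not part of this diff):

    #include <cassert>
    #include <limits>

    int main() {
        const float ninf  = -std::numeric_limits<float>::infinity();
        const float score = 0.5f;                  // stands in for any finite, unmasked score
        assert(!(score < ninf));                   // `<`  never fires
        assert(score > ninf && !(ninf > ninf));    // `>`  fires on every unmasked score
        assert(ninf <= ninf && !(score <= ninf));  // `<=` matches only the -inf sentinel
        return 0;
    }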
--- megatron/fused_kernels/scaled_masked_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 01f708fad..9d8e41353 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -298,7 +298,7 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (elements[i][it] > -std::numeric_limits::infinity()) { + if (elements[i][it] < -std::numeric_limits::infinity()) { elements[i][it] = 0.0f; } else { elements[i][it] = std::exp((elements[i][it] - max_value[i])); From 910f93b9d472df4d67f95da56400a07a2f94d18b Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 1 Jul 2022 22:32:56 +0200 Subject: [PATCH 242/297] Add a test checking that ScaledMaskedSoftmax custom kernel does what we expect --- tests/test_model.py | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_model.py b/tests/test_model.py index 1efed8ef9..121891b73 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -11,6 +11,7 @@ from packaging import version from megatron import initialize_megatron, get_args, get_tokenizer, global_vars +from megatron.model.fused_softmax import FusedScaleMaskSoftmax, ScaledMaskedSoftmax from megatron.testing_utils import TestCasePlus, mockenv_context, flatten_arguments, torch_assert_equal, \ torch_assert_close from megatron.training import setup_model_and_optimizer @@ -364,6 +365,45 @@ def test_fused_layer_norm(self): torch_assert_equal(mfln_output, torch_layer_norm_output) + def test_fused_masked_sofmax(self): + command_args = get_default_args(self.test_file_dir_str) + + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + initialize_megatron() + args = get_args() + + dummy_input = torch.randn( + args.micro_batch_size, + args.num_attention_heads, + args.seq_length, + args.seq_length, + device="cuda", + dtype=args.params_dtype + ) + dummy_attention_mask = torch.randn( + args.micro_batch_size, + args.num_attention_heads, + args.seq_length, + args.seq_length, + device="cuda", + dtype=args.params_dtype + ) < 0 + scale = torch.rand(()) + + fused_scaled_softmax = ScaledMaskedSoftmax + + fused_output = fused_scaled_softmax.apply(dummy_input, dummy_attention_mask, scale) + + # mimick the same via torch + output = scale * dummy_input + output = output.masked_fill(dummy_attention_mask, -10000) + output = F.softmax(output, dim=-1) + + # Issue is we use -10000 in mimicking instead of `inf` + torch_assert_close(fused_output, output) + + def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self): # TODO @thomasw21 make sure that if pass a causal mask, it is take in account. The following shows that fused_kernel completely ignores the masking is we set the variable incorrectly. 
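# In other words, the fused-kernel path may silently ignore an explicit mask when the mask type is declared causal; see: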
# https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/131bd43e9f3552f2413a442f51c22214d4f6fb19/megatron/model/fused_softmax.py#L190 From 75f99ef7258a7eccd99ac099a72a7581aa9e7af2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 00:45:42 +0200 Subject: [PATCH 243/297] Head specific mask is not implemented --- tests/test_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 121891b73..967b1182c 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -365,7 +365,7 @@ def test_fused_layer_norm(self): torch_assert_equal(mfln_output, torch_layer_norm_output) - def test_fused_masked_sofmax(self): + def test_fused_masked_softmax(self): command_args = get_default_args(self.test_file_dir_str) with patch('sys.argv', flatten_arguments(command_args)): @@ -383,7 +383,7 @@ def test_fused_masked_sofmax(self): ) dummy_attention_mask = torch.randn( args.micro_batch_size, - args.num_attention_heads, + 1, # `args.num_attention_heads` not implemented in our cuda kernel args.seq_length, args.seq_length, device="cuda", From c34f107322bb09afd4722ea619a99acfd7ac9ba2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 02:32:27 +0200 Subject: [PATCH 244/297] Test something out --- .../fused_kernels/scaled_masked_softmax.h | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 9d8e41353..7876e7600 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -225,7 +225,7 @@ __global__ void scaled_masked_softmax_warp_forward( constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) + // gridDim/blockIdx = (seq_len, attn_heads, batches) int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; int pad_first_batch = 0; if (pad_batches != 1) { // bert style @@ -234,6 +234,9 @@ __global__ void scaled_masked_softmax_warp_forward( pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; } + int local_seq = blockIdx.x + 1; + int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; + // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. int local_batches = micro_batch_size - first_batch; @@ -253,7 +256,7 @@ __global__ void scaled_masked_softmax_warp_forward( uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : element_count; + int batch_element_count = (i >= local_batches) ? 
0 : local_seq; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { @@ -267,7 +270,7 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { if (temp_mask[element] != 1) { - elements[i][it + element] = (acc_t)temp_data[element] * scale; + elements[i][it + element] = (acc_t)temp_data[element] * (acc_t)scale; } else { elements[i][it + element] = -std::numeric_limits::infinity(); } @@ -298,12 +301,14 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (elements[i][it] < -std::numeric_limits::infinity()) { - elements[i][it] = 0.0f; - } else { - elements[i][it] = std::exp((elements[i][it] - max_value[i])); + if (it < warp_iteration_limit) { + if (elements[i][it] < -std::numeric_limits::infinity()) { + elements[i][it] = 0.0f; + } else { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); + } + sum[i] += elements[i][it]; } - sum[i] += elements[i][it]; } } warp_reduce(sum); @@ -325,7 +330,7 @@ __global__ void scaled_masked_softmax_warp_forward( for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { out[element] = elements[i][it + element] / sum[i]; } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); + copy_vector(dst + i * element_count + it * WARP_SIZE, out); } else { break; } From ed6131aad2aa2074b73baf154e2c84dade031a6d Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 02:43:53 +0200 Subject: [PATCH 245/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 7876e7600..db4d1146f 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -270,7 +270,7 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { if (temp_mask[element] != 1) { - elements[i][it + element] = (acc_t)temp_data[element] * (acc_t)scale; + elements[i][it + element] = (acc_t)temp_data[element] * scale; } else { elements[i][it + element] = -std::numeric_limits::infinity(); } @@ -325,10 +325,14 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { + if (element_index < local_seq) { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = elements[i][it + element] / sum[i]; + if (sum[i] == 0.0) { + out[element] = 0.0; + } else { + out[element] = elements[i][it + element] / sum[i]; + } } copy_vector(dst + i * element_count + it * WARP_SIZE, out); } else { From 3a846a0a93171bded9884e8582b7af1e5230c674 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 02:46:57 +0200 Subject: [PATCH 246/297] Test something out --- .../fused_kernels/scaled_masked_softmax.h | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index db4d1146f..14fc3c977 100644 --- 
a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -234,8 +234,8 @@ __global__ void scaled_masked_softmax_warp_forward( pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; } - int local_seq = blockIdx.x + 1; - int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; +// int local_seq = blockIdx.x + 1; +// int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. @@ -256,7 +256,7 @@ __global__ void scaled_masked_softmax_warp_forward( uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : local_seq; + int batch_element_count = (i >= local_batches) ? 0 : element_count; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { @@ -301,14 +301,12 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (it < warp_iteration_limit) { - if (elements[i][it] < -std::numeric_limits::infinity()) { - elements[i][it] = 0.0f; - } else { - elements[i][it] = std::exp((elements[i][it] - max_value[i])); - } - sum[i] += elements[i][it]; + if (elements[i][it] < -std::numeric_limits::infinity()) { + elements[i][it] = 0.0f; + } else { + elements[i][it] = std::exp((elements[i][it] - max_value[i])); } + sum[i] += elements[i][it]; } } warp_reduce(sum); @@ -325,7 +323,7 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < local_seq) { + if (element_index < element_count) { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { if (sum[i] == 0.0) { From 5746641ee652ec8aac76280a9f4ffffa5382f50f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 02:58:13 +0200 Subject: [PATCH 247/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 14fc3c977..e12bfb318 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -256,7 +256,7 @@ __global__ void scaled_masked_softmax_warp_forward( uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : element_count; + int batch_element_count = (i >= local_batches) ? 
0 : local_seq; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { @@ -316,6 +316,7 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { if (sum[i] == 0.0) { + copy_zero_vector(dst + i * element_count * stride); break; } if (i >= local_batches) @@ -326,11 +327,7 @@ __global__ void scaled_masked_softmax_warp_forward( if (element_index < element_count) { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (sum[i] == 0.0) { - out[element] = 0.0; - } else { - out[element] = elements[i][it + element] / sum[i]; - } + out[element] = elements[i][it + element] / sum[i]; } copy_vector(dst + i * element_count + it * WARP_SIZE, out); } else { From 292620c48d2c1f4adcee4a1b7be61a219553f367 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 03:01:53 +0200 Subject: [PATCH 248/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index e12bfb318..f6cf0ce32 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -256,13 +256,13 @@ __global__ void scaled_masked_softmax_warp_forward( uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - int batch_element_count = (i >= local_batches) ? 0 : local_seq; +// int batch_element_count = (i >= local_batches) ? 0 : ele; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < batch_element_count) { + if (element_index < element_count) { int itr_idx = i*element_count+it*WARP_SIZE; copy_vector(temp_data, src + itr_idx); copy_vector(temp_mask, mask + itr_idx); From 0e1ef5dc32ec902a0a1b7d4d53330c16778fdeb3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 03:07:16 +0200 Subject: [PATCH 249/297] Test something out --- .../fused_kernels/scaled_masked_softmax.h | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index f6cf0ce32..e5cafaf44 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -47,6 +47,21 @@ __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t * template <> __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } +template +__device__ __inline__ void copy_zero_vector(Datatype *dst); + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst = 0.0; } + +template <> +__device__ __inline__ void copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + int log2_ceil(int value) { int log2_value = 0; while ((1 << log2_value) < value) ++log2_value; @@ -312,24 +327,24 @@ __global__ void scaled_masked_softmax_warp_forward( warp_reduce(sum); // store result - output_t out[ELEMENTS_PER_LDG_STG] { 0.0f }; + 
output_t out[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { - if (sum[i] == 0.0) { - copy_zero_vector(dst + i * element_count * stride); - break; - } if (i >= local_batches) break; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < element_count) { - #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - out[element] = elements[i][it + element] / sum[i]; + if (sum[i] == 0.0) { + copy_zero_vector(dst + i * element_count + it * WARP_SIZE); + } else { + #pragma unroll + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + out[element] = elements[i][it + element] / sum[i]; + } + copy_vector(dst + i * element_count + it * WARP_SIZE, out); } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); } else { break; } From 964a275f772b1f4453d0c900108b7d8c17fdb9d0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 03:14:08 +0200 Subject: [PATCH 250/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index e5cafaf44..798c9145f 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -271,13 +271,13 @@ __global__ void scaled_masked_softmax_warp_forward( uint8_t temp_mask[ELEMENTS_PER_LDG_STG]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { -// int batch_element_count = (i >= local_batches) ? 0 : ele; + int batch_element_count = (i >= local_batches) ? 0 : element_count; #pragma unroll for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; - if (element_index < element_count) { + if (element_index < batch_element_count) { int itr_idx = i*element_count+it*WARP_SIZE; copy_vector(temp_data, src + itr_idx); copy_vector(temp_mask, mask + itr_idx); From 8b31e9cace905e9c203794c58b9c992748bd1317 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 03:29:17 +0200 Subject: [PATCH 251/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 798c9145f..7b11e0f35 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -241,7 +241,7 @@ __global__ void scaled_masked_softmax_warp_forward( // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH; int pad_first_batch = 0; if (pad_batches != 1) { // bert style pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; @@ -316,8 +316,8 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (elements[i][it] < 
-std::numeric_limits::infinity()) { - elements[i][it] = 0.0f; + if (elements[i][it] =< -std::numeric_limits::infinity()) { + elements[i][it] = 0; } else { elements[i][it] = std::exp((elements[i][it] - max_value[i])); } @@ -336,7 +336,7 @@ __global__ void scaled_masked_softmax_warp_forward( for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < element_count) { - if (sum[i] == 0.0) { + if (sum[i] == 0.) { copy_zero_vector(dst + i * element_count + it * WARP_SIZE); } else { #pragma unroll From 723a5b399aff03a39d64fe237ba4cf20233a10c2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 03:32:30 +0200 Subject: [PATCH 252/297] Test something out --- megatron/fused_kernels/scaled_masked_softmax.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 7b11e0f35..458e7adee 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -316,7 +316,7 @@ __global__ void scaled_masked_softmax_warp_forward( for (int i = 0; i < WARP_BATCH; ++i) { #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { - if (elements[i][it] =< -std::numeric_limits::infinity()) { + if (elements[i][it] <= -std::numeric_limits::infinity()) { elements[i][it] = 0; } else { elements[i][it] = std::exp((elements[i][it] - max_value[i])); From 65b4ea2835bdc3d91d87fb6287e4bcc0aec986ed Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 05:29:08 +0200 Subject: [PATCH 253/297] Test something out --- finetune_t0_non_causal_decoder.py | 5 +++-- .../fused_kernels/scaled_masked_softmax.h | 22 +++++++++---------- tests/test_model.py | 3 +++ 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index cb6776b19..c1587d5cb 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -5,7 +5,7 @@ from megatron import get_args, get_tokenizer, print_rank_0, mpu from megatron.data.mtf_dataset import build_train_valid_test_datasets -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, AttnMaskType from megatron.model import GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask @@ -36,7 +36,8 @@ def model_provider(pre_process=True, post_process=True): if args.deepspeed: model = GPTModelPipe( num_tokentypes=0, - parallel_output=True + parallel_output=True, + attn_mask_type=AttnMaskType.custom ) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call model.set_batch_fn after deepspeed.initialize diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 458e7adee..dfa48148c 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -128,7 +128,7 @@ __global__ void scaled_softmax_warp_forward( // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; + int first_batch = (blockDim.y * (blockIdx.x + 
gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH; // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. @@ -278,18 +278,18 @@ __global__ void scaled_masked_softmax_warp_forward( int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < batch_element_count) { - int itr_idx = i*element_count+it*WARP_SIZE; + int itr_idx = i * element_count * element_count + it * WARP_SIZE; copy_vector(temp_data, src + itr_idx); copy_vector(temp_mask, mask + itr_idx); #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (temp_mask[element] != 1) { - elements[i][it + element] = (acc_t)temp_data[element] * scale; - } else { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } } else { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { @@ -337,13 +337,13 @@ __global__ void scaled_masked_softmax_warp_forward( int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < element_count) { if (sum[i] == 0.) { - copy_zero_vector(dst + i * element_count + it * WARP_SIZE); + copy_zero_vector(dst + i * element_count * element_count + it * WARP_SIZE); } else { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { out[element] = elements[i][it + element] / sum[i]; } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); + copy_vector(dst + i * element_count * element_count + it * WARP_SIZE , out); } } else { break; diff --git a/tests/test_model.py b/tests/test_model.py index 967b1182c..1026f6acc 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -400,6 +400,9 @@ def test_fused_masked_softmax(self): output = output.masked_fill(dummy_attention_mask, -10000) output = F.softmax(output, dim=-1) + # Test that the nonzeros are the same with the mask + for i in range(args.num_attention_heads): + torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0])) # Issue is we use -10000 in mimicking instead of `inf` torch_assert_close(fused_output, output) From 7eaced4567e1a0941fb4faa9b759e853e589d438 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 05:48:29 +0200 Subject: [PATCH 254/297] Maybe nothing is wrong --- .../fused_kernels/scaled_masked_softmax.h | 121 +++++++++--------- megatron/model/fused_softmax.py | 14 +- 2 files changed, 70 insertions(+), 65 deletions(-) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index dfa48148c..612909d1c 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -47,9 +47,6 @@ __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t * template <> __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } -template -__device__ __inline__ void copy_zero_vector(Datatype *dst); - template <> __device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } @@ -62,6 +59,7 @@ __device__ __inline__ void copy_zero_vector(c10::Half *dst) { *dst template <> __device__ __inline__ void 
copy_zero_vector(c10::Half *dst) { *((float2*) dst) = make_float2(0.0f, 0.0f); } + int log2_ceil(int value) { int log2_value = 0; while ((1 << log2_value) < value) ++log2_value; @@ -109,16 +107,16 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) { /* * Extended softmax (from native aten pytorch) with following additional features * 1) input scaling - */ + */ template __global__ void scaled_softmax_warp_forward( - output_t *dst, + output_t *dst, const input_t *src, - const acc_t scale, - int micro_batch_size, + const acc_t scale, + int micro_batch_size, int element_count) { - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and // warp_size of method warp_softmax_forward_kernel. constexpr int next_power_of_two = 1 << log2_elements; constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; @@ -127,8 +125,8 @@ __global__ void scaled_softmax_warp_forward( constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH; + // gridDim/blockIdx = (seq_len, attn_heads, batches) + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. @@ -207,10 +205,10 @@ __global__ void scaled_softmax_warp_forward( for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { out[element] = elements[i][it + element] / sum[i]; } - copy_vector(dst + i * element_count + it * WARP_SIZE, out); + copy_vector(dst + i * element_count + it * WARP_SIZE, out); } else { break; - } + } } } } @@ -220,18 +218,18 @@ __global__ void scaled_softmax_warp_forward( * Extended softmax (from native aten pytorch) with following additional features * 1) input scaling * 2) Explicit masking - */ + */ template __global__ void scaled_masked_softmax_warp_forward( - output_t *dst, + output_t *dst, const input_t *src, - const uint8_t *mask, - const acc_t scale, - int micro_batch_size, + const uint8_t *mask, + const acc_t scale, + int micro_batch_size, int element_count, - int pad_batches) + int pad_batches) { - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and // warp_size of method warp_softmax_forward_kernel. constexpr int next_power_of_two = 1 << log2_elements; constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? 
next_power_of_two : C10_WARP_SIZE; @@ -241,7 +239,7 @@ __global__ void scaled_masked_softmax_warp_forward( // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) // gridDim/blockIdx = (seq_len, attn_heads, batches) - int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + threadIdx.y) * WARP_BATCH; + int first_batch = (blockDim.y * (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z))+ threadIdx.y) * WARP_BATCH; int pad_first_batch = 0; if (pad_batches != 1) { // bert style pad_first_batch = (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * WARP_BATCH; @@ -249,9 +247,6 @@ __global__ void scaled_masked_softmax_warp_forward( pad_first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; } -// int local_seq = blockIdx.x + 1; -// int warp_iteration_limit = (local_seq + ELEMENTS_PER_LDG_STG * WARP_SIZE - 1)/ WARP_SIZE; - // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. int local_batches = micro_batch_size - first_batch; @@ -278,18 +273,18 @@ __global__ void scaled_masked_softmax_warp_forward( int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < batch_element_count) { - int itr_idx = i * element_count * element_count + it * WARP_SIZE; + int itr_idx = i*element_count+it*WARP_SIZE; copy_vector(temp_data, src + itr_idx); copy_vector(temp_mask, mask + itr_idx); #pragma unroll - for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { - if (temp_mask[element] != 1) { - elements[i][it + element] = (acc_t)temp_data[element] * scale; - } else { - elements[i][it + element] = -std::numeric_limits::infinity(); - } - } + for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { + if (temp_mask[element] != 1) { + elements[i][it + element] = (acc_t)temp_data[element] * scale; + } else { + elements[i][it + element] = -std::numeric_limits::infinity(); + } + } } else { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { @@ -317,7 +312,7 @@ __global__ void scaled_masked_softmax_warp_forward( #pragma unroll for (int it = 0; it < WARP_ITERATIONS; ++it) { if (elements[i][it] <= -std::numeric_limits::infinity()) { - elements[i][it] = 0; + elements[i][it] = 0.0f; } else { elements[i][it] = std::exp((elements[i][it] - max_value[i])); } @@ -336,32 +331,32 @@ __global__ void scaled_masked_softmax_warp_forward( for (int it = 0; it < WARP_ITERATIONS; it+=ELEMENTS_PER_LDG_STG) { int element_index = ELEMENTS_PER_LDG_STG * local_idx + it * WARP_SIZE; if (element_index < element_count) { - if (sum[i] == 0.) 
{ - copy_zero_vector(dst + i * element_count * element_count + it * WARP_SIZE); + if (sum[i] == 0.0f) { + copy_zero_vector(dst + i * element_count + it * WARP_SIZE); } else { #pragma unroll for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { out[element] = elements[i][it + element] / sum[i]; } - copy_vector(dst + i * element_count * element_count + it * WARP_SIZE , out); + copy_vector(dst + i * element_count + it * WARP_SIZE, out); } } else { break; - } + } } } } template __global__ void scaled_masked_softmax_warp_backward( - output_t *gradInput, - input_t *grad, + output_t *gradInput, + input_t *grad, const input_t *output, - acc_t scale, - int micro_batch_size, + acc_t scale, + int micro_batch_size, int element_count) { - // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and + // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and // warp_size of method warp_softmax_backward_kernel. constexpr int next_power_of_two = 1 << log2_elements; constexpr int WARP_SIZE = (next_power_of_two < C10_WARP_SIZE) ? next_power_of_two : C10_WARP_SIZE; @@ -370,9 +365,9 @@ __global__ void scaled_masked_softmax_warp_backward( constexpr int ELEMENTS_PER_LDG_STG = (WARP_ITERATIONS < 4) ? 1 : 4; // blockDim/threadIdx = (WARP_SIZE, WARPS_PER_BLOCK, ) - // gridDim/blockIdx = (seq_len, attn_heads, batches) + // gridDim/blockIdx = (seq_len, attn_heads, batches) int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * WARP_BATCH; - + // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. int local_batches = micro_batch_size - first_batch; @@ -412,10 +407,10 @@ __global__ void scaled_masked_softmax_warp_backward( for (int element = 0; element < ELEMENTS_PER_LDG_STG; ++element) { grad_reg[i][it + element] = (acc_t)temp_grad[element] * output_reg[i][it + element]; } - } + } } } - + acc_t sum[WARP_BATCH]; #pragma unroll for (int i = 0; i < WARP_BATCH; ++i) { @@ -443,7 +438,7 @@ __global__ void scaled_masked_softmax_warp_backward( out[element] = (output_t)(scale * (grad_reg[i][it + element] - output_reg[i][it + element] * sum[i])); } copy_vector(gradInput + i * element_count + it * WARP_SIZE, out); - } + } } } } @@ -465,11 +460,11 @@ int get_batch_per_block(int query_seq_len, int key_seq_len, int batches, int att template void dispatch_scaled_softmax_forward( - output_t *dst, - const input_t *src, - const input_t scale, - int query_seq_len, - int key_seq_len, + output_t *dst, + const input_t *src, + const input_t scale, + int query_seq_len, + int key_seq_len, int batches, int attn_heads) { @@ -557,12 +552,12 @@ void dispatch_scaled_softmax_forward( template void dispatch_scaled_masked_softmax_forward( - output_t *dst, - const input_t *src, + output_t *dst, + const input_t *src, const uint8_t *mask, - const input_t scale, - int query_seq_len, - int key_seq_len, + const input_t scale, + int query_seq_len, + int key_seq_len, int batches, int attn_heads, int pad_batches) @@ -651,12 +646,12 @@ void dispatch_scaled_masked_softmax_forward( template void dispatch_scaled_masked_softmax_backward( - output_t *grad_input, - input_t *grad, - const input_t *output, - const acc_t scale, - int query_seq_len, - int key_seq_len, + output_t *grad_input, + input_t *grad, + const input_t *output, + const acc_t scale, + int query_seq_len, + int key_seq_len, int batches, int attn_heads) { @@ -740,4 +735,4 @@ void dispatch_scaled_masked_softmax_backward( break; } } -} +} \ No newline at end of file diff --git 
a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index a3054d730..30b3a5262 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -157,7 +157,16 @@ def forward(self, input, mask): assert input.dim() == 4 if self.is_kernel_available(mask, *input.size()): - return self.forward_fused_softmax(input, mask) + result = self.forward_fused_softmax(input, mask) + for batch_id in range(len(mask)): + print("Batch id", batch_id) + print(" inputs", input.shape, input[batch_id, 0]) + print(" mask", mask.shape, mask[batch_id, 0]) + print(" result", result.shape, result[batch_id, 0]) + print(" hello", torch.nonzero(~mask[batch_id, 0])[100:150]) + print(" bye", torch.nonzero(result[batch_id, 0])[41:100]) + print(" all ones?", torch.sum(result, dim=-1)) + return result else: return self.forward_torch_softmax(input, mask) @@ -186,8 +195,9 @@ def forward_fused_softmax(self, input, mask): b, np, sq, sk = input.size() scale = self.scale if self.scale is not None else 1.0 - if self.attn_mask_type == AttnMaskType.causal and mask is None: + if self.attn_mask_type == AttnMaskType.causal: assert sq == sk, "causal mask is only for self attention" + assert mask is None # input is 3D tensor (attn_batches, sq, sk) input = input.view(-1, sq, sk) From da9f3160679fa3eb042e89338af9bb5fd4d2c6ab Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 06:01:37 +0200 Subject: [PATCH 255/297] Woops --- megatron/fused_kernels/scaled_masked_softmax.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 612909d1c..73bbe65a8 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -47,6 +47,9 @@ __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t * template <> __device__ __inline__ void copy_vector(uint8_t *dst, const uint8_t *src) {*((half2*) dst) = *((half2*) src); } +template +__device__ __inline__ void copy_zero_vector(Datatype *dst); + template <> __device__ __inline__ void copy_zero_vector(c10::BFloat16 *dst) { *dst = 0.0; } From 8b67bd988eb3b2295fd01ab45c2b43b7cb8cd28e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 06:14:35 +0200 Subject: [PATCH 256/297] Use bloom instead --- tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_training.py b/tests/test_training.py index 260a54ba3..d877136e2 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -512,7 +512,7 @@ def test_training_t0(self): --checkpoint-activations --exit-interval {exit_interval} --tokenizer-type PretrainedFromHF - --tokenizer-name-or-path gpt2 + --tokenizer-name-or-path bigscience/tokenizer --log-path {logs_dir} --save {output_dir}/checkpoints --load {output_dir}/checkpoints From 84007bc2a4fda0ab258500fa23bc21c0ce4692c0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 06:35:09 +0200 Subject: [PATCH 257/297] Make MTF dataloader an infinite dataloader --- megatron/data/data_samplers.py | 57 +++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 3e7df32d3..41154868a 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -305,6 +305,7 @@ def __init__(self, sequence_length, 
dataset, total_samples, consumed_samples, mi self.micro_batch_size * data_parallel_size self.last_batch_size = \ self.total_samples % self.micro_batch_times_data_parallel_size + self.active_total_samples = self.total_samples - self.last_batch_size # Sanity checks. assert self.total_samples > 0, \ @@ -319,9 +320,7 @@ def __len__(self): return self.total_samples def __iter__(self): - active_total_samples = self.total_samples - self.last_batch_size - self.epoch = self.consumed_samples // active_total_samples - current_epoch_samples = self.consumed_samples % active_total_samples + current_epoch_samples = self.consumed_samples % self.active_total_samples assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 # data sharding and random sampling @@ -331,26 +330,34 @@ def __iter__(self): start_idx = self.data_parallel_rank * bucket_size g = torch.Generator() - g.manual_seed(self.epoch) - - random_idx = torch.randperm(bucket_size, generator=g).tolist() - idx_range = [start_idx + x for x in random_idx[bucket_offset:]] - batch = [] - batch_count = 0 - token_lens = 0 - # Last batch if not complete will be dropped. - for idx in idx_range: - tok_len = len(self.dataset[idx]['input_tokens']) + len(self.dataset[idx]['target_tokens']) - if token_lens + tok_len > self.sequence_length: - batch_count += 1 - token_lens = 0 - - if batch_count == self.micro_batch_size: - self.consumed_samples += self.micro_batch_times_data_parallel_size - yield batch - batch_count = 0 - batch = [] - else: - token_lens += tok_len - batch.append(idx) + # Infinite loader + while True: + g.manual_seed(self.epoch) + + # Randomly shuffle the dataset + random_idx = torch.randperm(bucket_size, generator=g).tolist() + idx_range = [start_idx + x for x in random_idx[bucket_offset:]] + + batch = [] + batch_count = 0 + token_lens = 0 + # Last batch if not complete will be dropped. + for idx in idx_range: + tok_len = len(self.dataset[idx]['input_tokens']) + len(self.dataset[idx]['target_tokens']) + if token_lens + tok_len > self.sequence_length: + batch_count += 1 + token_lens = 0 + + if batch_count == self.micro_batch_size: + self.consumed_samples += self.micro_batch_times_data_parallel_size + yield batch + batch_count = 0 + batch = [] + else: + token_lens += tok_len + batch.append(idx) + + @property + def epoch(self): + return self.consumed_samples // self.active_total_samples From 273d420b63c6f31e0a144216536080a134a12d1a Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 12:25:14 +0200 Subject: [PATCH 258/297] Work into moving packing logic into a dataset --- megatron/data/data_samplers.py | 205 ++------ megatron/data/decoder_packed_mtf_dataset.py | 494 ++++++++++++++++++++ megatron/data/mtf_dataset.py | 10 +- megatron/tokenizer/tokenizer.py | 22 +- tests/test_dataloaders.py | 76 ++- 5 files changed, 624 insertions(+), 183 deletions(-) create mode 100644 megatron/data/decoder_packed_mtf_dataset.py diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 41154868a..00d2165b9 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -20,73 +20,10 @@ from megatron import get_args, get_tokenizer from megatron import mpu +from megatron.data.decoder_packed_mtf_dataset import DecoderPackedMTFDataset from megatron.data.mtf_dataset import MTFDataset -def pack_samples(items, max_seq_len: int, micro_batch_size: int, pad_token: int): - """ - Greedily packs samples. 
- - Items: - [ - { - 'input_tokens': array([6, 7]), - 'target_tokens': array([8]) - }, - { - 'input_tokens': array([3, 4]), - 'target_tokens': array([5]) - } - ] - - Output: - decoder_tokens = [[6, 7, 8, 3, 4, 5, ]]: Concatenation of tokens followed with padding tokens. - decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]: Segment ids determine original documents. - decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. - """ - - decoder_tokens = torch.full((micro_batch_size, max_seq_len), pad_token, dtype=torch.int64) - decoder_segment_ids = torch.zeros((micro_batch_size, max_seq_len), dtype=torch.int64) - decoder_is_inputs = torch.full((micro_batch_size, max_seq_len), False, dtype=torch.bool) - - batch_num = 0 - # `0` is reserved for padding - item_num = 1 - cur_len = 0 - for token_dict in items: - input_token_len = len(token_dict["input_tokens"]) - target_token_len = len(token_dict["target_tokens"]) - total_len = input_token_len + target_token_len - if cur_len + total_len > max_seq_len: - len_diff = max_seq_len - cur_len - # Padding - if len_diff > 0: - decoder_tokens[batch_num][cur_len: max_seq_len] = pad_token - decoder_segment_ids[batch_num][cur_len: max_seq_len] = 0 - # padded values are already 0, no need to update `decoder_is_inputs` - batch_num += 1 - assert batch_num < micro_batch_size - item_num = 1 - cur_len = 0 - - decoder_tokens[batch_num][cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"]) - decoder_tokens[batch_num][cur_len + input_token_len: cur_len + total_len] = torch.from_numpy(token_dict["target_tokens"]) - decoder_segment_ids[batch_num][cur_len: cur_len + total_len] = item_num - decoder_is_inputs[batch_num][cur_len: cur_len + input_token_len] = 1 # inputs - # targets are already 0 at init, no need to update `decoder_is_inputs` - - item_num += 1 - cur_len += total_len - assert cur_len < max_seq_len - - # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here - return { - "decoder_token_ids": decoder_tokens, - "decoder_segment_ids": decoder_segment_ids, - "decoder_is_inputs": decoder_is_inputs, - } - - def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): """Buld dataloader given an input dataset.""" @@ -95,6 +32,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): args = get_args() # Megatron sampler + collate_fn = Noen if args.dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataset), @@ -110,23 +48,14 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) elif args.dataloader_type == 'decoder_packed': - assert isinstance(dataset, MTFDataset) - batch_sampler = MegatronDecoderPackedText2TextRandomSampler( - sequence_length=args.seq_length + 1, - dataset=dataset, - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=args.micro_batch_size, - data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size()) - elif args.dataloader_type == 'packed': - batch_sampler = MegatronPackedRandomSampler( - sequence_length=args.seq_length, + assert isinstance(dataset, DecoderPackedMTFDataset) + batch_sampler = MegatronDecoderPackedText2TextSampler( total_samples=len(dataset), consumed_samples=consumed_samples, micro_batch_size=args.micro_batch_size, 
data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) + collate_fn = concatenate_dict_of_tensor else: raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) @@ -135,11 +64,11 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): num_workers = args.num_workers collate_fn = None - if args.dataloader_type == 'decoder_packed': - assert isinstance(dataset, MTFDataset) - pad_token = get_tokenizer().pad - collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1, micro_batch_size=args.micro_batch_size, - pad_token=pad_token) + # if args.dataloader_type == 'decoder_packed': + # assert isinstance(dataset, MTFDataset) + # pad_token = get_tokenizer().pad + # collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1, micro_batch_size=args.micro_batch_size, + # pad_token=pad_token) # Torch dataloader. return torch.utils.data.DataLoader( @@ -226,35 +155,6 @@ def __init__(self, total_samples, consumed_samples, micro_batch_size, def __len__(self): return self.total_samples - -class MegatronPackedRandomSampler(object): - """docstring for MegatronPackedRandomSampler""" - def __init__(self, sequence_length, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size): - # Keep a copy of input params for later use. - self.sequence_length = sequence_length - self.total_samples = total_samples - self.consumed_samples = consumed_samples - self.micro_batch_size = micro_batch_size - self.data_parallel_rank = data_parallel_rank - self.data_parallel_size = data_parallel_size - self.micro_batch_times_data_parallel_size = \ - self.micro_batch_size * data_parallel_size - self.last_batch_size = \ - self.total_samples % self.micro_batch_times_data_parallel_size - - # Sanity checks. - assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.micro_batch_size > 0 - assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) - - def __len__(self): - return self.total_samples - def __iter__(self): active_total_samples = self.total_samples - self.last_batch_size self.epoch = self.consumed_samples // active_total_samples @@ -283,33 +183,26 @@ def __iter__(self): batch = [] -class MegatronDecoderPackedText2TextRandomSampler(object): - """ - Converts a two stream dataset with `input_tokens` and `target_tokens` and creates a batch that should be greedily - packed to be passed onto the decoder model. +class MegatronDecoderPackedText2TextSampler(object): + """Sampler used with `DecoderPackedMTFDataset""" - To be used with `pack_samples` as collate_fn - """ - - def __init__(self, sequence_length, dataset, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size): + def __init__(self, total_samples, consumed_samples, micro_batch_size, + data_parallel_rank, data_parallel_size, drop_last=True): # Keep a copy of input params for later use. 
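# Packing has moved into `DecoderPackedMTFDataset` itself, so this sampler no
# longer needs the dataset handle or the sequence length: it only shards flat
# sample indices across data-parallel ranks.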
- self.dataset = dataset - self.sequence_length = sequence_length self.total_samples = total_samples self.consumed_samples = consumed_samples self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank - self.data_parallel_size = data_parallel_size self.micro_batch_times_data_parallel_size = \ self.micro_batch_size * data_parallel_size - self.last_batch_size = \ - self.total_samples % self.micro_batch_times_data_parallel_size - self.active_total_samples = self.total_samples - self.last_batch_size + self.drop_last = drop_last # Sanity checks. assert self.total_samples > 0, \ 'no sample to consume: {}'.format(self.total_samples) + assert self.consumed_samples < self.total_samples, \ + 'no samples left to consume: {}, {}'.format(self.consumed_samples, + self.total_samples) assert self.micro_batch_size > 0 assert data_parallel_size > 0 assert self.data_parallel_rank < data_parallel_size, \ @@ -319,45 +212,29 @@ def __init__(self, sequence_length, dataset, total_samples, consumed_samples, mi def __len__(self): return self.total_samples - def __iter__(self): - current_epoch_samples = self.consumed_samples % self.active_total_samples - assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0 + def get_start_end_idx(self): + start_idx = self.data_parallel_rank * self.micro_batch_size + end_idx = start_idx + self.micro_batch_size + return start_idx, end_idx - # data sharding and random sampling - bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size - bucket_offset = current_epoch_samples // self.data_parallel_size - start_idx = self.data_parallel_rank * bucket_size + def __iter__(self): + batch = [] + # Last batch will be dropped if drop_last is not set False + for idx in range(self.consumed_samples, self.total_samples): + batch.append(idx) + if len(batch) == self.micro_batch_times_data_parallel_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] + batch = [] - g = torch.Generator() + # Check the last partial batch and see drop_last is set + if len(batch) > 0 and not self.drop_last: + start_idx, end_idx = self.get_start_end_idx() + yield batch[start_idx:end_idx] - # Infinite loader - while True: - g.manual_seed(self.epoch) - - # Randomly shuffle the dataset - random_idx = torch.randperm(bucket_size, generator=g).tolist() - idx_range = [start_idx + x for x in random_idx[bucket_offset:]] - - batch = [] - batch_count = 0 - token_lens = 0 - # Last batch if not complete will be dropped. 
- for idx in idx_range: - tok_len = len(self.dataset[idx]['input_tokens']) + len(self.dataset[idx]['target_tokens']) - if token_lens + tok_len > self.sequence_length: - batch_count += 1 - token_lens = 0 - - if batch_count == self.micro_batch_size: - self.consumed_samples += self.micro_batch_times_data_parallel_size - yield batch - batch_count = 0 - batch = [] - else: - token_lens += tok_len - batch.append(idx) - - @property - def epoch(self): - return self.consumed_samples // self.active_total_samples +def concatenate_dict_of_tensor(list_dict_of_tensors): + keys = list(list_dict_of_tensors[0].keys()) + result = {} + for key in keys: + result[key] = torch.stack([sample[key] for sample in list_dict_of_tensors]) + return result diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py new file mode 100644 index 000000000..2b1b5f504 --- /dev/null +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -0,0 +1,494 @@ +import os +import time + +import numpy as np +import torch + +from megatron import print_rank_0, mpu +from megatron.data.blendable_dataset import BlendableDataset +from megatron.data.dataset_utils import get_datasets_weights_and_num_samples, get_split_by_range_, \ + get_train_valid_test_split_ +from megatron.data.mtf_dataset import MTFDataset +from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset + + +def build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + seq_length: int, + pad_token: int, + eos_token: int, + train_valid_test_num_samples, + seed, + skip_warmup +): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets( + data_prefix=data_prefix[0], + data_impl=data_impl, + splits_string=splits_string, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + train_valid_test_num_samples=train_valid_test_num_samples, + seed=seed, + skip_warmup=skip_warmup + ) + # Blending dataset. + else: + + output = get_datasets_weights_and_num_samples(data_prefix=data_prefix, train_valid_test_num_samples=train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. 
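# Each prefix below becomes its own packed dataset; `BlendableDataset` then
# mixes them according to `weights`, with per-dataset sample counts taken from
# `get_datasets_weights_and_num_samples` above.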
+ train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + data_prefix=prefixes[i], + data_impl=data_impl, + splits_string=splits_string, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], + seed=seed, + skip_warmup=skip_warmup + ) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + all_train_datasets = BlendableDataset(train_datasets, weights) \ + if train_datasets else None + all_valid_datasets = BlendableDataset(valid_datasets, weights) \ + if valid_datasets else None + all_test_datasets = BlendableDataset(test_datasets, weights) \ + if test_datasets else None + + return all_train_datasets, all_valid_datasets, all_test_datasets + + +def build_dataset_group( + dataset_group_name, + paths, + weights, + splits, + data_impl, + seq_length: int, + pad_token: int, + eos_token: int, + train_valid_test_num_samples, + seed, + skip_warmup, + train_valid_test +): + ''' + Build a single dataset group corresponding to Option 2 of data loading see arguments.py + a dataset group is passed in the following form + GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 + or alternatively + GIVEN_NAME PATH1 # for a single dataset to be used fully + ''' + + assert train_valid_test in ["train","valid","test"] + + # Single dataset. + if len(paths) == 1: + dataset = _build_single_datasets( + data_prefix=paths[0], + range_string=splits[0], + data_impl=data_impl, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + train_valid_test_num_samples=train_valid_test_num_samples, + seed=seed, + skip_warmup=skip_warmup, + dataset_group_name=dataset_group_name, + train_valid_test=train_valid_test + ) + return dataset + # Blending dataset. + else: + + data_prefix = [] + # data_prefix is of the shape: + # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] + for w,p in zip(weights, paths): + data_prefix += [w,p] + + output = get_datasets_weights_and_num_samples(data_prefix, + train_valid_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + + # Build individual datasets. + datasets = [] + for i in range(len(prefixes)): + ds = _build_single_datasets( + data_prefix=prefixes[i], + range_string=splits[i], + data_impl=data_impl, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], + seed=seed, + skip_warmup=skip_warmup, + dataset_group_name=dataset_group_name, + train_valid_test=train_valid_test + ) + + datasets.append(ds) + all_datasets = BlendableDataset(datasets, weights) + + return all_datasets + +def _build_single_datasets( + data_prefix, + range_string, + data_impl, + seq_length: int, + pad_token: int, + eos_token: int, + train_valid_test_num_samples, + seed, + skip_warmup, + dataset_group_name, + train_valid_test +): + """Build a single dataset""" + + assert train_valid_test in ["train","valid","test"] + index = ["train","valid","test"].index(train_valid_test) + + # Target indexed dataset. 
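# Split bounds are derived from the *targets* index alone; the inputs index is
# assumed to hold exactly one parallel document per target.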
+ target_indexed_dataset = get_indexed_dataset( + data_prefix=data_prefix, + is_input=False, + data_impl=data_impl, + skip_warmup=skip_warmup + ) + + total_num_of_documents = target_indexed_dataset.sizes.shape[0] + # this corresponds to option2 for data loading on the form + # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 + # splits here is an array of size 2 [start_index, end_index] + splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) + + # Print stats about the splits. + print_rank_0(' > dataset split:') + + print_rank_0(' {}:'.format(dataset_group_name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[0], splits[1], + splits[1] - splits[0])) + + def build_dataset(name): + dataset = None + if splits[1] > splits[0]: + documents = np.arange(start=splits[0], stop=splits[1], + step=1, dtype=np.int32) + dataset = DecoderPackedMTFDataset( + name=name, + data_prefix=data_prefix, + data_impl=data_impl, + skip_warmup=skip_warmup, + documents=documents, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + num_samples=train_valid_test_num_samples[index], + seed=seed + ) + return dataset + + dataset = build_dataset(dataset_group_name) + + return dataset + + +def _build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + seq_length: int, + pad_token: int, + eos_token: int, + train_valid_test_num_samples, + seed, + skip_warmup +): + """Build train, valid, and test datasets.""" + + # Target indexed dataset. + target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) + + total_num_of_documents = target_indexed_dataset.sizes.shape[0] + # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + # Print stats about the splits. 
+ print_rank_0(' > dataset split:') + + def print_split_stats(name, index): + print_rank_0(' {}:'.format(name)) + print_rank_0(' document indices in [{}, {}) total of {} ' + 'documents'.format(splits[index], splits[index + 1], + splits[index + 1] - splits[index])) + print_split_stats('train', 0) + print_split_stats('validation', 1) + print_split_stats('test', 2) + + def build_dataset(index, name): + dataset = None + if splits[index + 1] > splits[index]: + documents = np.arange(start=splits[index], stop=splits[index + 1], + step=1, dtype=np.int32) + dataset = DecoderPackedMTFDataset( + name=name, + data_prefix=data_prefix, + data_impl=data_impl, + skip_warmup=skip_warmup, + documents=documents, + seq_length=seq_length, + pad_token=pad_token, + eos_token=eos_token, + num_samples=train_valid_test_num_samples[index], + seed=seed + ) + return dataset + + train_dataset = build_dataset(0, 'train') + valid_dataset = build_dataset(1, 'valid') + test_dataset = build_dataset(2, 'test') + + return (train_dataset, valid_dataset, test_dataset) + + +class DecoderPackedMTFDataset(torch.utils.data.Dataset): + + def __init__( + self, + name, + data_prefix, + data_impl, + skip_warmup, + documents, + num_samples, + seq_length: int, + pad_token: int, + eos_token: int, + seed, + ): + self.mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, num_samples=num_samples, seed=seed) + + self.pad_token = pad_token + self.seq_length = seq_length + + self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=self.seed) + + def __len__(self): + return len(self.sample_index) + + def __getitem__(self, idx): + # Get the shuffled index. + start, end = self.sample_index[idx] + mtf_samples_indices = self.shuffle_index[start: end] + # TODO @thomasw21 build a dataset that generates an entire batch instead of a row (allows for more optimization) + items = [self.mtf_dataset[sample_id] for sample_id in mtf_samples_indices] + + return self.pack_samples(items) + + def pack_samples(self, items): + """ + Greedily packs samples. + + Items: + [ + { + 'input_tokens': array([6, 7]), + 'target_tokens': array([8]) + }, + { + 'input_tokens': array([3, 4]), + 'target_tokens': array([5]) + } + ] + + Output: + decoder_tokens = [[6, 7, 8, 3, 4, 5, ]]: Concatenation of tokens followed with padding tokens. + decoder_segment_ids = [[1, 1, 1, 2, 2, 2, 0]]: Segment ids determine original documents. + decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. 
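Packing is greedy: items are appended in order until the next one would overflow `seq_length`; that item is never split, packing for this row simply stops (the index mapping below arranges for the overflow item to start the next packed row). In the example above, with seq_length=7 a third item of 5 tokens would not fit in the single remaining slot and would be deferred.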
+ """ + + decoder_tokens = torch.full((self.seq_size), self.pad_token, dtype=torch.int64) + decoder_segment_ids = torch.zeros((self.seq_size), dtype=torch.int64) + decoder_is_inputs = torch.full((self.seq_size), False, dtype=torch.bool) + + # `0` is reserved for padding + item_num = 1 + cur_len = 0 + for token_dict in items: + input_token_len = len(token_dict["input_tokens"]) + target_token_len = len(token_dict["target_tokens"]) + total_len = input_token_len + target_token_len + + if cur_len + total_len > self.seq_size: + break + + decoder_tokens[cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"]) + decoder_tokens[cur_len + input_token_len: cur_len + total_len] = torch.from_numpy( + token_dict["target_tokens"]) + decoder_segment_ids[cur_len: cur_len + total_len] = item_num + decoder_is_inputs[cur_len: cur_len + input_token_len] = 1 # inputs + # targets are already 0 at init, no need to update `decoder_is_inputs` + + item_num += 1 + cur_len += total_len + assert cur_len < self.seq_size + + # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here + return { + "decoder_token_ids": decoder_tokens, + "decoder_segment_ids": decoder_segment_ids, + "decoder_is_inputs": decoder_is_inputs, + } + + +def _build_index_mappings( + name, + data_prefix, + mtf_dataset, + num_samples: int, + seq_length: int, + seed, +): + """ + - `shuffle_index` is [num_epoch * len(self.mtf)] + - `sample_index` is [num_sample, 2] (storing the start and end of the sample). We query the sample via `self.shuffle_index[start:end]` + + TODO @thomas21 Instead of loading individually samples, we save the packing one and for all + """ + # rng state + np_rng = np.random.RandomState(seed=seed) + + # Filename of the index mappings. + _filename = data_prefix + _filename += '_{}_indexmap'.format(name) + _filename += '_{}ns'.format(num_samples) + _filename += '_{}s'.format(seed) + sample_idx_filename = _filename + '_decoder_packed_batch_idx.npy' + shuffle_idx_filename = _filename + '_decoder_packed_shuffle_idx.npy' + + # Build the indexed mapping if not exist. 
+ if torch.distributed.get_rank() == 0: + if (not os.path.isfile(sample_idx_filename)) or \ + (not os.path.isfile(shuffle_idx_filename)): + + print_rank_0(' > WARNING: could not find index map files, building ' + 'the indices on rank 0 ...') + + # iteratively add the entire dataset for every epoch and see if it's enough given current packing strategy + start_time = time.time() + row_offset = 0 + shuffle_idx = [] + sample_idx = [] + while len(sample_idx) <= num_samples: + new_document_ids = _build_shuffle_idx(dataset_size=len(mtf_dataset), np_rng=np_rng) + # Generate a shuffling of the entire dataset + shuffle_idx.append(new_document_ids) + # Packs them into a single sample + new_samples, row_offset = _build_sample_idx(mtf_dataset=mtf_dataset, seq_length=seq_length, row_offset=row_offset) + sample_idx.extend(new_samples) + + shuffle_idx = np.concatenate(shuffle_idx, axis=0) + sample_idx = np.stack(sample_idx, axis=0) + + np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) + np.save(sample_idx_filename, sample_idx, allow_pickle=True) + print_rank_0(' > elasped time to build and save shuffle-idx and sample-idx mapping' + ' (seconds): {:4f}'.format(time.time() - start_time)) + + # This should be a barrier but nccl barrier assumes + # device_index=rank which is not the case for model + # parallel case + counts = torch.cuda.LongTensor([1]) + torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) + torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) + assert counts[0].item() == ( + torch.distributed.get_world_size() // + torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) + + # Load mappings. + start_time = time.time() + print_rank_0(' > loading doc-idx mapping from {}'.format( + sample_idx_filename)) + sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' > loading shuffle-idx mapping from {}'.format( + shuffle_idx_filename)) + shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r') + print_rank_0(' loaded indexed file in {:3.3f} seconds'.format( + time.time() - start_time)) + + return sample_idx, shuffle_idx + +def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset): + """Build start and off index of each `full` batch, return that list of batch + start of the unfinished batch""" + row_length = row_offset + + full_samples = [] + current_sample_start = 0 + for current_sample_end, document_id in enumerate(document_ids): + sample = mtf_dataset[document_id] + + # TODO @thomasw21 figure out if we add tokens + tok_len = len(sample["input_tokens"]) + len(sample["target_tokens"]) + + row_length = row_length + tok_len + if row_length > seq_length: + # current sample can't be added and requires to be added in the next one + full_samples.append(np.asarray([current_sample_start, current_sample_end])) + current_sample_start = current_sample_end + row_length = tok_len + + + return full_samples, row_length + +def _build_shuffle_idx(dataset_size, np_rng): + """Build the range [0, dataset_size) and shuffle.""" + dtype_ = np.uint32 + if dataset_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + return np_rng.shuffle(np.arange(stop=dataset_size, step=1, dtype=dtype_)) + + +def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool): + if is_input: + field = "inputs" + else: + field = "targets" + + return get_indexed_dataset_(f"{data_prefix}_{field}_document", data_impl, skip_warmup) + +def get_indexed_dataset_(path, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(' > building dataset index ...') + start_time = time.time() + indexed_dataset = make_indexed_dataset(path, + data_impl, + skip_warmup) + print_rank_0(' > finished creating indexed dataset in {:4f} ' + 'seconds'.format(time.time() - start_time)) + print_rank_0(' number of documents: {}'.format( + indexed_dataset.sizes.shape[0])) + + return indexed_dataset diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py index 044a4ab3a..6f2dd7cac 100644 --- a/megatron/data/mtf_dataset.py +++ b/megatron/data/mtf_dataset.py @@ -262,12 +262,10 @@ def __init__( documents, num_samples, seed, - impossible_token=-100, ): # Params to store. self.name = name - self.impossible_token = impossible_token # Dataset. self.input_indexed_dataset = get_indexed_dataset(data_prefix, is_input=True, data_impl=data_impl, skip_warmup=skip_warmup) @@ -297,8 +295,8 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - input_tokens = self.input_indexed_dataset.get(self.doc_idx[idx]) - target_tokens = self.target_indexed_dataset.get(self.doc_idx[idx]) + input_tokens = self.input_indexed_dataset[self.doc_idx[idx]] + target_tokens = self.target_indexed_dataset[self.doc_idx[idx]] return { 'input_tokens': np.array(input_tokens, dtype=np.int64), @@ -325,8 +323,8 @@ def _build_index_mappings( _filename = data_prefix _filename += '_{}_indexmap'.format(name) _filename += '_{}ns'.format(num_samples) _filename += '_{}s'.format(seed) - doc_idx_filename = _filename + '_doc_idx.npy' - shuffle_idx_filename = _filename + '_shuffle_idx.npy' + doc_idx_filename = _filename + '_mtf_doc_idx.npy' + shuffle_idx_filename = _filename + '_mtf_shuffle_idx.npy' # Build the indexed mapping if not exist.
if torch.distributed.get_rank() == 0: diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 3f0501b7d..ad5756f4d 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -355,9 +355,8 @@ def detokenize(self, token_ids): @property def eod(self): - candidate = self.tokenizer.eos_token_id - self._check_token_candidate(candidate) - return candidate + # TODO @thomasw21 might conflict with + return self.eos @property def cls(self): @@ -384,17 +383,20 @@ def mask(self): return candidate @property - def additional_special_tokens_ids(self): - """ All the additional special tokens you may want to use (list of strings).""" - return self.tokenizer.additional_special_tokens_ids + def bos(self): + raise NotImplementedError("Missing ") @property - def bos_token_id(self): - raise NotImplementedError("Missing ") + def eos(self): + # TODO @thomasw21 might conflict with the notion of + candidate = self.tokenizer.eos + self._check_token_candidate(candidate) + return candidate @property - def eos_token_id(self): - raise NotImplementedError("Missing ") + def additional_special_tokens_ids(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self.tokenizer.additional_special_tokens_ids @staticmethod def _check_token_candidate(candidate): diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 867641906..2ac273d9a 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -7,7 +7,7 @@ import finetune_t0_non_causal_decoder from megatron import global_vars, get_tokenizer, initialize_megatron, get_args -from megatron.data import mlm_dataset, mtf_dataset +from megatron.data import mlm_dataset, mtf_dataset, decoder_packed_mtf_dataset from megatron.data.data_samplers import build_pretraining_data_loader from megatron.testing_utils import TestCasePlus, flatten_arguments, mockenv_context, torch_assert_equal @@ -176,7 +176,7 @@ def test_mtf_dataset(self): # TODO @thomasw21 make sure that input and target are aligned. - def test_mtf_packed_dataloader(self): + def test_decoder_packed_mtf_dataloader(self): command_args = get_default_args() command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt" command_args["--dataloader-type"] = "decoder_packed" @@ -196,12 +196,15 @@ def test_mtf_packed_dataloader(self): args.eval_iters * args.global_batch_size, 0 ] - train_ds, valid_ds, test_ds = mtf_dataset.build_train_valid_test_datasets( + train_ds, valid_ds, test_ds = decoder_packed_mtf_dataset.build_train_valid_test_datasets( data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, # TODO @thomasw21 figure how that value works train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length + 1, + pad_token=tokenizer.pad, + eos_token=tokenizer.eos, seed=args.seed, skip_warmup=(not args.mmap_warmup) ) @@ -244,6 +247,73 @@ def test_mtf_packed_dataloader(self): # update `last_padding_size` last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) + def test_packed_decoder_mtf_dataset(self): + command_args = get_default_args() + command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt" + + with patch('sys.argv', flatten_arguments(command_args)): + with mockenv_context(**self.dist_env_1_gpu): + deepspeed.init_distributed() + initialize_megatron() + + args = get_args() + tokenizer = get_tokenizer() + # Hack: `gpt2` doesn't have a padding token, so we override that value. 
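# With the override below, `tokenizer.pad` resolves to the eos id, which is
# all the packed dataset needs in order to fill the tail of each row.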
+ tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id + + train_val_test_num_samples = [ + args.train_iters * args.global_batch_size, + args.eval_iters * args.global_batch_size, + 0 + ] + train_ds, valid_ds, test_ds = decoder_packed_mtf_dataset.build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + # TODO @thomasw21 figure how that value works + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length + 1, + pad_token=tokenizer.pad, + eos_token=tokenizer.eos, + seed=args.seed, + skip_warmup=(not args.mmap_warmup) + ) + + last_padding_size = 0 + for i, samples in enumerate(batch_sampler): + micro_batch_size, seq_length = items["decoder_token_ids"].shape + + # Check dtypes + self.assertEqual(items["decoder_token_ids"].dtype, torch.int64) + self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64) + self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool) + + # `micro_batch_size` correspond to the one in argument + self.assertEqual(micro_batch_size, args.micro_batch_size) + # `seq_length` correspond to the one in argument + 1 in order to get tokens/labels + self.assertEqual(seq_length, args.seq_length + 1) + + original_samples_count = 0 + for batch_id in range(micro_batch_size): + segment_ids = [k for k, _ in itertools.groupby(items["decoder_segment_ids"][batch_id])] + # `segment_ids` is [1,2,...] + self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids)))) + # `0` signify that the tokens are padding + self.assertEqual(segment_ids[-1], 0) + original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0]) + + # Test that we actually pack, ie we have more samples than the `batch_size` + self.assertGreater(original_samples_count, micro_batch_size) + + # Test that the first sample of each batch couldn't fit inside the previous batch + first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1] + first_sample_size = len(list(first_sample_segment_ids)) + self.assertGreater(first_sample_size, last_padding_size) + + # update `last_padding_size` + last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) + + def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): command_args = get_default_args() command_args["--position-embedding-type"] = "alibi" From 688d06e476e618876bd060076e965006f97260f9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:40:04 +0200 Subject: [PATCH 259/297] Woops --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index ad5756f4d..9f4e5b772 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -389,7 +389,7 @@ def bos(self): @property def eos(self): # TODO @thomasw21 might conflict with the notion of - candidate = self.tokenizer.eos + candidate = self.tokenizer.eos_token_id self._check_token_candidate(candidate) return candidate From ddc6a61a04e050cd2c68b6fd6fd3117b02d0cf91 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:40:37 +0200 Subject: [PATCH 260/297] Woops --- megatron/data/data_samplers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 00d2165b9..05a86d39c 
100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -18,10 +18,9 @@ from functools import partial import torch -from megatron import get_args, get_tokenizer +from megatron import get_args from megatron import mpu from megatron.data.decoder_packed_mtf_dataset import DecoderPackedMTFDataset -from megatron.data.mtf_dataset import MTFDataset def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): @@ -32,7 +31,7 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): args = get_args() # Megatron sampler - collate_fn = Noen + collate_fn = None if args.dataloader_type == 'single': batch_sampler = MegatronPretrainingSampler( total_samples=len(dataset), From 0e34e8d109f18878f909d9cec6bf81378717afb5 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:52:46 +0200 Subject: [PATCH 261/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 12 ++-- tests/test_dataloaders.py | 66 --------------------- 2 files changed, 6 insertions(+), 72 deletions(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 2b1b5f504..e3f549685 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -293,7 +293,7 @@ def __init__( self.pad_token = pad_token self.seq_length = seq_length - self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=self.seed) + self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=seed) def __len__(self): return len(self.sample_index) @@ -329,9 +329,9 @@ def pack_samples(self, items): decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. 
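Note that each call now builds a single packed row, so every tensor below has shape `(seq_length,)` — hence the trailing commas in the corrected `torch.full`/`torch.zeros` sizes; rows are only stacked into `(micro_batch_size, seq_length)` later, by the dataloader's collate function.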
""" - decoder_tokens = torch.full((self.seq_size), self.pad_token, dtype=torch.int64) - decoder_segment_ids = torch.zeros((self.seq_size), dtype=torch.int64) - decoder_is_inputs = torch.full((self.seq_size), False, dtype=torch.bool) + decoder_tokens = torch.full((self.seq_length,), self.pad_token, dtype=torch.int64) + decoder_segment_ids = torch.zeros((self.seq_length,), dtype=torch.int64) + decoder_is_inputs = torch.full((self.seq_length,), False, dtype=torch.bool) # `0` is reserved for padding item_num = 1 @@ -341,7 +341,7 @@ def pack_samples(self, items): target_token_len = len(token_dict["target_tokens"]) total_len = input_token_len + target_token_len - if cur_len + total_len > self.seq_size: + if cur_len + total_len > self.seq_length: break decoder_tokens[cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"]) @@ -353,7 +353,7 @@ def pack_samples(self, items): item_num += 1 cur_len += total_len - assert cur_len < self.seq_size + assert cur_len < self.seq_length # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here return { diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 2ac273d9a..2e01aea1d 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -247,72 +247,6 @@ def test_decoder_packed_mtf_dataloader(self): # update `last_padding_size` last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) - def test_packed_decoder_mtf_dataset(self): - command_args = get_default_args() - command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt" - - with patch('sys.argv', flatten_arguments(command_args)): - with mockenv_context(**self.dist_env_1_gpu): - deepspeed.init_distributed() - initialize_megatron() - - args = get_args() - tokenizer = get_tokenizer() - # Hack: `gpt2` doesn't have a padding token, so we override that value. - tokenizer.tokenizer.pad_token_id = tokenizer.tokenizer.eos_token_id - - train_val_test_num_samples = [ - args.train_iters * args.global_batch_size, - args.eval_iters * args.global_batch_size, - 0 - ] - train_ds, valid_ds, test_ds = decoder_packed_mtf_dataset.build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - # TODO @thomasw21 figure how that value works - train_valid_test_num_samples=train_val_test_num_samples, - seq_length=args.seq_length + 1, - pad_token=tokenizer.pad, - eos_token=tokenizer.eos, - seed=args.seed, - skip_warmup=(not args.mmap_warmup) - ) - - last_padding_size = 0 - for i, samples in enumerate(batch_sampler): - micro_batch_size, seq_length = items["decoder_token_ids"].shape - - # Check dtypes - self.assertEqual(items["decoder_token_ids"].dtype, torch.int64) - self.assertEqual(items["decoder_segment_ids"].dtype, torch.int64) - self.assertEqual(items["decoder_is_inputs"].dtype, torch.bool) - - # `micro_batch_size` correspond to the one in argument - self.assertEqual(micro_batch_size, args.micro_batch_size) - # `seq_length` correspond to the one in argument + 1 in order to get tokens/labels - self.assertEqual(seq_length, args.seq_length + 1) - - original_samples_count = 0 - for batch_id in range(micro_batch_size): - segment_ids = [k for k, _ in itertools.groupby(items["decoder_segment_ids"][batch_id])] - # `segment_ids` is [1,2,...] 
- self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids)))) - # `0` signify that the tokens are padding - self.assertEqual(segment_ids[-1], 0) - original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0]) - - # Test that we actually pack, ie we have more samples than the `batch_size` - self.assertGreater(original_samples_count, micro_batch_size) - - # Test that the first sample of each batch couldn't fit inside the previous batch - first_sample_segment_ids = next(itertools.groupby(items["decoder_segment_ids"][0]))[1] - first_sample_size = len(list(first_sample_segment_ids)) - self.assertGreater(first_sample_size, last_padding_size) - - # update `last_padding_size` - last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) - def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): command_args = get_default_args() From 014b8b827a0afe75c071cca7e6aaa7ffd4f03994 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:55:09 +0200 Subject: [PATCH 262/297] Woops --- tests/test_dataloaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 2e01aea1d..5b38a76df 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -279,7 +279,7 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): loss_mask = loss_mask.cpu() self.assertEqual(loss_mask.dtype, torch.float) - torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_token_ids"] != tokenizer.pad)) + torch_assert_equal(loss_mask.bool(), ~data["decoder_is_inputs"][:, 1:] * (data["decoder_token_ids"][:, :-1] != tokenizer.pad)) torch_assert_equal(tokens, data["decoder_token_ids"][:, :-1]) torch_assert_equal(labels, data["decoder_token_ids"][:, 1:]) From c53622a9a11d4a077d651bba27550b8eb2786046 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 13:59:53 +0200 Subject: [PATCH 263/297] Woops --- finetune_t0_non_causal_decoder.py | 2 +- megatron/data/decoder_packed_mtf_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index c1587d5cb..3faad1951 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -4,7 +4,7 @@ import torch from megatron import get_args, get_tokenizer, print_rank_0, mpu -from megatron.data.mtf_dataset import build_train_valid_test_datasets +from megatron.data.decoder_packed_mtf_dataset import build_train_valid_test_datasets from megatron.enums import PositionEmbeddingType, AttnMaskType from megatron.model import GPTModelPipe from megatron.training import pretrain diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index e3f549685..0daff134b 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -406,7 +406,7 @@ def _build_index_mappings( # Generate a shuffling of the entire dataset shuffle_idx.append(new_document_ids) # Packs them into a single sample - new_samples, row_offset = _build_sample_idx(mtf_dataset=mtf_dataset, seq_length=seq_length, row_offset=row_offset) + new_samples, row_offset = _build_sample_idx(mtf_dataset=mtf_dataset, document_ids=new_document_ids ,seq_length=seq_length, row_offset=row_offset) sample_idx.extend(new_samples) 
shuffle_idx = np.concatenate(shuffle_idx, axis=0) From ea221a8816ebf60fb05a889d6a6b39de49122b47 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:04:50 +0200 Subject: [PATCH 264/297] Woops --- finetune_t0_non_causal_decoder.py | 8 +++++++- megatron/data/decoder_packed_mtf_dataset.py | 7 ++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 3faad1951..9e780d31a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -106,6 +106,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): args = get_args() train_ds, valid_ds, test_ds = None, None, None + tokenizer = get_tokenizer() + print_rank_0("> building train, validation, and test datasets for T0 ...") # Option 1 of data loading using --data-path # For T0, data has to be provided in the form --data-path input-data target-data input-data2 target-data2 ... @@ -115,9 +117,13 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): data_prefix=args.data_path, data_impl=args.data_impl, splits_string=args.split, + seq_length=args.seq_length, + pad_token=tokenizer.pad, + eos_token=tokenizer.eos, train_valid_test_num_samples=train_val_test_num_samples, seed=args.seed, - skip_warmup=(not args.mmap_warmup)) + skip_warmup=(not args.mmap_warmup) + ) else: raise NotImplementedError("No dataloading argument passed") diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 0daff134b..7e0a9cdde 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -468,7 +468,12 @@ def _build_shuffle_idx(dataset_size, np_rng): if dataset_size >= (np.iinfo(np.uint32).max - 1): dtype_ = np.int64 - return np_rng.shuffle(np.arange(stop=dataset_size, step=1, dtype=dtype_)) + result = np.arange(stop=dataset_size, step=1, dtype=dtype_) + + # in-place shuffling + np_rng.shuffle(result) + + return result def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool): From 3274986387fdf44592c00d347a3cacdaa81ba658 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:17:58 +0200 Subject: [PATCH 265/297] Woops --- megatron/data/mtf_dataset.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py index 6f2dd7cac..42cd0acfd 100644 --- a/megatron/data/mtf_dataset.py +++ b/megatron/data/mtf_dataset.py @@ -44,7 +44,6 @@ def build_train_valid_test_datasets( data_prefix=data_prefix[0], data_impl=data_impl, splits_string=splits_string, - train_valid_test_num_samples=train_valid_test_num_samples, seed=seed, skip_warmup=skip_warmup ) @@ -63,7 +62,6 @@ def build_train_valid_test_datasets( data_prefix=prefixes[i], data_impl=data_impl, splits_string=splits_string, - train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], seed=seed, skip_warmup=skip_warmup ) @@ -84,9 +82,17 @@ def build_train_valid_test_datasets( return all_train_datasets, all_valid_datasets, all_test_datasets -def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, - train_valid_test_num_samples, - seed, skip_warmup, train_valid_test): +def build_dataset_group( + dataset_group_name, + paths, + weights, + splits, + data_impl, + train_valid_test_num_samples, + seed, + skip_warmup, + 
train_valid_test +): ''' Build a single dataset group corresponding to Option 2 of data loading see arguments.py a dataset group is passed in the following form @@ -103,7 +109,6 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, data_prefix=paths[0], range_string=splits[0], data_impl=data_impl, - train_valid_test_num_samples=train_valid_test_num_samples, seed=seed, skip_warmup=skip_warmup, dataset_group_name=dataset_group_name, @@ -130,7 +135,6 @@ def build_dataset_group(dataset_group_name, paths, weights, splits, data_impl, data_prefix=prefixes[i], range_string=splits[i], data_impl=data_impl, - train_valid_test_num_samples=datasets_train_valid_test_num_samples[i], seed=seed, skip_warmup=skip_warmup, dataset_group_name=dataset_group_name, @@ -146,7 +150,6 @@ def _build_single_datasets( data_prefix, range_string, data_impl, - train_valid_test_num_samples, seed, skip_warmup, dataset_group_name, @@ -190,7 +193,6 @@ def build_dataset(name): data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, - num_samples=train_valid_test_num_samples[index], seed=seed ) return dataset @@ -204,7 +206,6 @@ def _build_train_valid_test_datasets( data_prefix, data_impl, splits_string, - train_valid_test_num_samples, seed, skip_warmup ): @@ -239,7 +240,6 @@ def build_dataset(index, name): data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, - num_samples=train_valid_test_num_samples[index], seed=seed ) return dataset @@ -260,7 +260,6 @@ def __init__( data_impl, skip_warmup, documents, - num_samples, seed, ): @@ -282,7 +281,6 @@ def __init__( name=self.name, data_prefix=data_prefix, documents=documents, - num_samples=num_samples, seed=seed ) @@ -295,8 +293,8 @@ def __len__(self): def __getitem__(self, idx): # Get the shuffled index. idx = self.shuffle_idx[idx] - input_tokens = self.input_indexed_dataset[self.doc_idx[idx]] - target_tokens = self.target_indexed_dataset[self.doc_idx[idx]] + input_tokens = self.input_indexed_dataset.get(self.doc_idx[idx]) + target_tokens = self.target_indexed_dataset.get(self.doc_idx[idx]) return { 'input_tokens': np.array(input_tokens, dtype=np.int64), @@ -308,7 +306,6 @@ def _build_index_mappings( name, data_prefix, documents, - num_samples, seed, ): """Build doc-idx, sample-idx, and shuffle-idx. @@ -321,7 +318,6 @@ def _build_index_mappings( # Filename of the index mappings. 
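# Note that the `_{}ns` (num_samples) component is dropped from the cache key
# below: the MTF document order now depends only on the seed, so one cached
# mapping can serve any requested sample count.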
_filename = data_prefix _filename += '_{}_indexmap'.format(name) - _filename += '_{}ns'.format(num_samples) _filename += '_{}s'.format(seed) doc_idx_filename = _filename + '_mtf_doc_idx.npy' shuffle_idx_filename = _filename + '_mtf_shuffle_idx.npy' From 9a5bf96d15a1ff748acaa7660d3fdf4522dcd490 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:19:01 +0200 Subject: [PATCH 266/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 7e0a9cdde..608dd55df 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -288,7 +288,7 @@ def __init__( eos_token: int, seed, ): - self.mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, num_samples=num_samples, seed=seed) + self.mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, seed=seed) self.pad_token = pad_token self.seq_length = seq_length From d1605898c2316522c4ac6e4da1019098afe57c00 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:42:49 +0200 Subject: [PATCH 267/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 13 ++++++++++--- megatron/data/mtf_dataset.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 608dd55df..c639052ce 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -293,7 +293,7 @@ def __init__( self.pad_token = pad_token self.seq_length = seq_length - self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=seed) + self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, documents=documents, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=seed) def __len__(self): return len(self.sample_index) @@ -366,6 +366,7 @@ def pack_samples(self, items): def _build_index_mappings( name, data_prefix, + documents, mtf_dataset, num_samples: int, seq_length: int, @@ -402,11 +403,17 @@ def _build_index_mappings( shuffle_idx = [] sample_idx = [] while len(sample_idx) <= num_samples: - new_document_ids = _build_shuffle_idx(dataset_size=len(mtf_dataset), np_rng=np_rng) + # TODO @thomas21 we should pass the list of documents we have acccess to instead of dataset_size + new_document_ids = _build_shuffle_idx(documents=documents, np_rng=np_rng) # Generate a shuffling of the entire dataset shuffle_idx.append(new_document_ids) # Packs them into a single sample - new_samples, row_offset = _build_sample_idx(mtf_dataset=mtf_dataset, document_ids=new_document_ids ,seq_length=seq_length, row_offset=row_offset) + new_samples, row_offset = _build_sample_idx( + mtf_dataset=mtf_dataset, + document_ids=new_document_ids, + seq_length=seq_length, + row_offset=row_offset + ) sample_idx.extend(new_samples) shuffle_idx = np.concatenate(shuffle_idx, axis=0) diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py index 42cd0acfd..72ed3de16 100644 --- a/megatron/data/mtf_dataset.py +++ 
b/megatron/data/mtf_dataset.py @@ -336,7 +336,7 @@ def _build_index_mappings( np.save(doc_idx_filename, doc_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save doc-idx mapping ' '(seconds): {:4f}'.format(time.time() - start_time)) - shuffle_idx = _build_shuffle_idx(doc_idx.shape[0] - 1 , doc_idx.shape[0] - 1, np_rng) + shuffle_idx = _build_shuffle_idx(doc_idx.shape[0], doc_idx.shape[0], np_rng) np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) print_rank_0(' > elasped time to build and save shuffle-idx mapping' ' (seconds): {:4f}'.format(time.time() - start_time)) From c3ab5b95c6e94e64c005f294eb6e18410e6cf739 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:46:33 +0200 Subject: [PATCH 268/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index c639052ce..0413c1a11 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -469,13 +469,9 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset): return full_samples, row_length -def _build_shuffle_idx(dataset_size, np_rng): +def _build_shuffle_idx(documents: np.array, np_rng): """Build the range [0, dataset_size) and shuffle.""" - dtype_ = np.uint32 - if dataset_size >= (np.iinfo(np.uint32).max - 1): - dtype_ = np.int64 - - result = np.arange(stop=dataset_size, step=1, dtype=dtype_) + result = np.copy(documents) # in-place shuffling np_rng.shuffle(result) From f5410765cfbe40fee83d677c5f96a405acb66fd2 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 14:52:39 +0200 Subject: [PATCH 269/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 0413c1a11..df0898e55 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -404,7 +404,7 @@ def _build_index_mappings( sample_idx = [] while len(sample_idx) <= num_samples: # TODO @thomas21 we should pass the list of documents we have acccess to instead of dataset_size - new_document_ids = _build_shuffle_idx(documents=documents, np_rng=np_rng) + new_document_ids = _build_shuffle_idx(documents=documents, total_size=len(mtf_dataset), np_rng=np_rng) # Generate a shuffling of the entire dataset shuffle_idx.append(new_document_ids) # Packs them into a single sample @@ -469,9 +469,14 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset): return full_samples, row_length -def _build_shuffle_idx(documents: np.array, np_rng): +def _build_shuffle_idx(documents: np.array, total_size: int, np_rng): """Build the range [0, dataset_size) and shuffle.""" - result = np.copy(documents) + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + result = np.arange(start=0, stop=len(documents), + step=1, dtype=dtype_) # in-place shuffling np_rng.shuffle(result) From 20be5b903d7375876a94d242ccf7c5264e0e28ba Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 15:03:11 +0200 Subject: [PATCH 270/297] Requires to remember how may epochs --- megatron/data/decoder_packed_mtf_dataset.py | 15 ++++++++++----- 
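The key bookkeeping in the diff below: `shuffle_idx` is built as a concatenation of one full permutation per epoch, so a packed row discovered during epoch `e` must record *global* offsets of the form `e * len(document_ids) + local_index` into that concatenation. A self-contained toy sketch of the scheme — `pack_epochs` and `lengths` are illustrative names, not code from this repo:

import numpy as np

def pack_epochs(lengths, seq_length, num_samples, seed=0):
    """Toy version of the epoch-offset packing; lengths[i] is doc i's token count."""
    rng = np.random.RandomState(seed)
    shuffle_idx, sample_idx = [], []
    row_len, row_start, epoch = 0, 0, 0
    while len(sample_idx) <= num_samples:
        perm = rng.permutation(len(lengths))
        shuffle_idx.append(perm)
        for i, doc in enumerate(perm):
            pos = epoch * len(perm) + i  # global position inside the concatenated shuffle_idx
            if row_len + lengths[doc] > seq_length:
                sample_idx.append((row_start, pos))  # row = shuffle_idx[row_start:pos]
                row_start = pos  # the overflowing doc opens the next row
                row_len = 0
            row_len += lengths[doc]
        epoch += 1
    return np.stack(sample_idx), np.concatenate(shuffle_idx)

Because `row_start` and `row_len` survive the epoch boundary, a packed row can straddle two epochs — which is exactly why the real `_build_sample_idx` now threads `row_offset`, `old_sample_start` and `epoch` through each call.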
From 20be5b903d7375876a94d242ccf7c5264e0e28ba Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:03:11 +0200
Subject: [PATCH 270/297] Requires remembering how many epochs

---
 megatron/data/decoder_packed_mtf_dataset.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index df0898e55..6cd7d1edb 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -400,10 +400,11 @@ def _build_index_mappings(
     # iteratively add the entire dataset for every epoch and see if it's enough given current packing strategy
     start_time = time.time()
     row_offset = 0
+    old_sample_start = 0
+    epoch = 0
     shuffle_idx = []
     sample_idx = []
     while len(sample_idx) <= num_samples:
-        # TODO @thomas21 we should pass the list of documents we have acccess to instead of dataset_size
         new_document_ids = _build_shuffle_idx(documents=documents, total_size=len(mtf_dataset), np_rng=np_rng)
         # Generate a shuffling of the entire dataset
         shuffle_idx.append(new_document_ids)
         # Packs them into a single sample
@@ -412,9 +413,12 @@ def _build_index_mappings(
             mtf_dataset=mtf_dataset,
             document_ids=new_document_ids,
             seq_length=seq_length,
-            row_offset=row_offset
+            row_offset=row_offset,
+            old_sample_start=old_sample_start,
+            epoch=epoch
         )
         sample_idx.extend(new_samples)
+        epoch +=1

     shuffle_idx = np.concatenate(shuffle_idx, axis=0)
     sample_idx = np.stack(sample_idx, axis=0)
@@ -447,13 +451,14 @@ def _build_index_mappings(
     return sample_idx, shuffle_idx

-def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset):
+def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sample_start, epoch):
     """Build start and off index of each `full` batch, return that list of batch + start of the unfinished batch"""
     row_length = row_offset

     full_samples = []
-    current_sample_start = 0
+    current_sample_start = old_sample_start
     for current_sample_end, document_id in enumerate(document_ids):
+        current_sample_end = epoch * len(document_ids) + current_sample_end
         sample = mtf_dataset[document_id]

         # TODO @thomasw21 figure out if we add tokens
@@ -467,7 +472,7 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset):

             row_length = tok_len

-    return full_samples, row_length
+    return full_samples, row_length, current_sample_start

 def _build_shuffle_idx(documents: np.array, total_size: int, np_rng):
     """Build the range [0, dataset_size) and shuffle."""

From d9719b6db822bc67baaa688890431e428919fd17 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:15:29 +0200
Subject: [PATCH 271/297] Find a way to reset states every time

---
 tests/test_dataloaders.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 5b38a76df..c858a2abc 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -1,4 +1,6 @@
 import itertools
+import os
+import shutil
 from typing import Set
 from unittest.mock import patch
@@ -103,9 +105,22 @@ def setUp(self) -> None:
             MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )

+    def copy_data_to_temp(self, prefix):
+        """copy data to temp, and return paths to temp version"""
+        basename = os.path.basename(prefix)
+        tmp_dir = self.get_auto_remove_tmp_dir()
+        for folder in os.listdir(os.path.dirname(prefix)):
+            if folder.startswith(prefix):
+                if os.path.isdir(folder):
+                    shutil.copytree(folder, tmp_dir)
+                else:
+                    shutil.copy2(folder, tmp_dir)
+        return os.path.join(tmp_dir, basename)
+
     def test_mlm_dataset(self):
         command_args = get_default_args()
-        command_args["--data-path"] = f"{self.data_dir}/gpt2/meg-gpt2-openwebtext_text_document"
+        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/meg-gpt2-openwebtext_text_document")
+        command_args["--data-path"] = data_path
         command_args["--noise_density"] = "0.15"
         command_args["--mean_noise_span_length"] = "3"
         command_args["--vocab-extra-ids"] = "100"
@@ -149,7 +164,8 @@ def test_mlm_dataset(self):

     def test_mtf_dataset(self):
         command_args = get_default_args()
-        command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt"
+        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/ag_news_prompt")
+        command_args["--data-path"] = data_path
         command_args["--dataloader-type"] = "decoder_packed"

         with patch('sys.argv', flatten_arguments(command_args)):
@@ -178,7 +194,8 @@ def test_decoder_packed_mtf_dataloader(self):
         command_args = get_default_args()
-        command_args["--data-path"] = f"{self.data_dir}/gpt2/ag_news_prompt"
+        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/ag_news_prompt")
+        command_args["--data-path"] = data_path
         command_args["--dataloader-type"] = "decoder_packed"

         with patch('sys.argv', flatten_arguments(command_args)):

From 4e0c4caf0aeefb13efb7cae9a37f406598963ba2 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:24:18 +0200
Subject: [PATCH 272/297] Find a way to reset states every time

---
 tests/test_dataloaders.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index c858a2abc..0a93e262e 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -105,21 +105,21 @@ def setUp(self) -> None:
             MASTER_ADDR="localhost", MASTER_PORT="9994", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
         )

-    def copy_data_to_temp(self, prefix):
+    def copy_data_to_temp(self, root_dir, prefix):
         """copy data to temp, and return paths to temp version"""
-        basename = os.path.basename(prefix)
+        src_path = os.path.join(root_dir, prefix)
         tmp_dir = self.get_auto_remove_tmp_dir()
-        for folder in os.listdir(os.path.dirname(prefix)):
-            if folder.startswith(prefix):
+        for folder in os.listdir(os.path.dirname(src_path)):
+            if folder.startswith(src_path):
                 if os.path.isdir(folder):
                     shutil.copytree(folder, tmp_dir)
                 else:
                     shutil.copy2(folder, tmp_dir)
-        return os.path.join(tmp_dir, basename)
+        return os.path.join(tmp_dir, prefix)

     def test_mlm_dataset(self):
         command_args = get_default_args()
-        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/meg-gpt2-openwebtext_text_document")
+        data_path = self.copy_data_to_temp(self.data_dir, "gpt2/meg-gpt2-openwebtext_text_document")
         command_args["--data-path"] = data_path
         command_args["--noise_density"] = "0.15"
         command_args["--mean_noise_span_length"] = "3"

From 48a55b9a6414cccf0f014cbcdeaf3d648cdc902b Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:27:34 +0200
Subject: [PATCH 273/297] Find a way to reset states every time

---
 tests/test_dataloaders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 0a93e262e..ac311321f 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -108,9 +108,10 @@ def setUp(self) -> None:
     def copy_data_to_temp(self, root_dir, prefix):
         """copy data to temp, and return paths to temp version"""
         src_path = os.path.join(root_dir, prefix)
+        dirname = os.path.dirname(src_path)
         tmp_dir = self.get_auto_remove_tmp_dir()
-        for folder in os.listdir(os.path.dirname(src_path)):
-            if folder.startswith(src_path):
+        for folder in os.listdir(dirname):
+            if os.path.join(dirname, folder).startswith(src_path):
                 if os.path.isdir(folder):
                     shutil.copytree(folder, tmp_dir)
                 else:

From 2e469e5ac65cbdfd409632c743e23bbfa05330d8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:31:00 +0200
Subject: [PATCH 274/297] Find a way to reset states every time

---
 tests/test_dataloaders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index ac311321f..bb12c1084 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -109,6 +109,7 @@ def copy_data_to_temp(self, root_dir, prefix):
         """copy data to temp, and return paths to temp version"""
         src_path = os.path.join(root_dir, prefix)
         dirname = os.path.dirname(src_path)
+        os.makedirs(dirname, exist_ok=True)
         tmp_dir = self.get_auto_remove_tmp_dir()
         for folder in os.listdir(dirname):
             if os.path.join(dirname, folder).startswith(src_path):
@@ -165,7 +166,7 @@ def test_mlm_dataset(self):

     def test_mtf_dataset(self):
         command_args = get_default_args()
-        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/ag_news_prompt")
+        data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
         command_args["--data-path"] = data_path
         command_args["--dataloader-type"] = "decoder_packed"
@@ -195,7 +196,7 @@ def test_decoder_packed_mtf_dataloader(self):
         command_args = get_default_args()
-        data_path = self.copy_data_to_temp(f"{self.data_dir}/gpt2/ag_news_prompt")
+        data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
         command_args["--data-path"] = data_path
         command_args["--dataloader-type"] = "decoder_packed"

From 74e03ec45cb5af4a416c13d927e5a3ca0a7c991c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:35:02 +0200
Subject: [PATCH 275/297] Find a way to reset states every time

---
 tests/test_dataloaders.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index bb12c1084..5e5b5afe1 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -108,16 +108,21 @@ def setUp(self) -> None:
     def copy_data_to_temp(self, root_dir, prefix):
         """copy data to temp, and return paths to temp version"""
         src_path = os.path.join(root_dir, prefix)
-        dirname = os.path.dirname(src_path)
-        os.makedirs(dirname, exist_ok=True)
+        src_dirname = os.path.dirname(src_path)
+
         tmp_dir = self.get_auto_remove_tmp_dir()
-        for folder in os.listdir(dirname):
-            if os.path.join(dirname, folder).startswith(src_path):
+        dest_path = os.path.join(tmp_dir, prefix)
+        dest_dirname = os.path.dirname(tmp_dir)
+        os.makedirs(dest_dirname, exist_ok=True)
+        for folder in os.listdir(src_dirname):
+            src_folder = os.path.join(src_dirname, folder)
+            dest_folder = os.path.join(dest_dirname, folder)
+            if src_folder.startswith(src_path):
                 if os.path.isdir(folder):
-                    shutil.copytree(folder, tmp_dir)
+                    shutil.copytree(src_folder, dest_folder)
                 else:
-                    shutil.copy2(folder, tmp_dir)
-        return os.path.join(tmp_dir, prefix)
+                    shutil.copy2(src_folder, dest_folder)
+        return dest_path

From f4a4733e9c8d6236cee0edc333141c6bcd6c232e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 15:42:04 +0200
Subject: [PATCH 276/297] Fix bugs

---
 megatron/data/decoder_packed_mtf_dataset.py | 2 +-
 tests/test_dataloaders.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index 6cd7d1edb..bbd3f45ad 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -409,7 +409,7 @@ def _build_index_mappings(
         # Generate a shuffling of the entire dataset
         shuffle_idx.append(new_document_ids)
         # Packs them into a single sample
-        new_samples, row_offset = _build_sample_idx(
+        new_samples, row_offset, old_sample_start = _build_sample_idx(
             mtf_dataset=mtf_dataset,
             document_ids=new_document_ids,
             seq_length=seq_length,

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 5e5b5afe1..1073eb98c 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -112,7 +112,7 @@ def copy_data_to_temp(self, root_dir, prefix):

         tmp_dir = self.get_auto_remove_tmp_dir()
         dest_path = os.path.join(tmp_dir, prefix)
-        dest_dirname = os.path.dirname(tmp_dir)
+        dest_dirname = os.path.dirname(dest_path)
         os.makedirs(dest_dirname, exist_ok=True)
         for folder in os.listdir(src_dirname):
             src_folder = os.path.join(src_dirname, folder)
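For orientation, the test helper that patches 271–276 converge on is used roughly like this (a hedged usage sketch, not an excerpt from the diffs):

    data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt")
    # copies every file whose path starts with the prefix (e.g. a .bin/.idx
    # pair) into a fresh auto-removed tmp dir, so each test run rebuilds its
    # *_indexmap*.npy cache from scratch instead of reusing a stale one
    command_args["--data-path"] = data_path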
From e1a3767766f39c2f608b76f9ab91dfda263e93d9 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 16:02:21 +0200
Subject: [PATCH 277/297] Cleanup

---
 megatron/data/data_samplers.py | 19 +------------------
 .../fused_kernels/scaled_masked_softmax.h | 2 +-
 megatron/model/fused_softmax.py | 13 ++-----------
 megatron/tokenizer/tokenizer.py | 16 ++++++----------
 pretrain_gpt.py | 18 ------------------
 tests/test_model.py | 4 ----
 tests/test_training.py | 1 +
 7 files changed, 11 insertions(+), 62 deletions(-)

diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py
index 05a86d39c..01b8f9405 100644
--- a/megatron/data/data_samplers.py
+++ b/megatron/data/data_samplers.py
@@ -15,7 +15,6 @@

 """Dataloaders."""

-from functools import partial
 import torch

 from megatron import get_args
@@ -31,7 +30,6 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
     args = get_args()

     # Megatron sampler
-    collate_fn = None
     if args.dataloader_type == 'single':
         batch_sampler = MegatronPretrainingSampler(
             total_samples=len(dataset),
@@ -54,7 +52,6 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
             micro_batch_size=args.micro_batch_size,
             data_parallel_rank=mpu.get_data_parallel_rank(),
             data_parallel_size=mpu.get_data_parallel_world_size())
-        collate_fn = concatenate_dict_of_tensor
     else:
         raise Exception('{} dataloader type is not supported.'.format(
             args.dataloader_type))
@@ -62,19 +59,12 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None):
     if num_workers is None:
         num_workers = args.num_workers

-    collate_fn = None
-    # if args.dataloader_type == 'decoder_packed':
-    #     assert isinstance(dataset, MTFDataset)
-    #     pad_token = get_tokenizer().pad
-    #     collate_fn = partial(pack_samples, max_seq_len=args.seq_length + 1, micro_batch_size=args.micro_batch_size,
-    #                          pad_token=pad_token)
-
     # Torch dataloader.
     return torch.utils.data.DataLoader(
         dataset,
         batch_sampler=batch_sampler,
         num_workers=num_workers,
-        collate_fn=collate_fn,
+        collate_fn=None,
         pin_memory=True
     )
@@ -230,10 +220,3 @@ def __iter__(self):
         if len(batch) > 0 and not self.drop_last:
             start_idx, end_idx = self.get_start_end_idx()
             yield batch[start_idx:end_idx]
-
-def concatenate_dict_of_tensor(list_dict_of_tensors):
-    keys = list(list_dict_of_tensors[0].keys())
-    result = {}
-    for key in keys:
-        result[key] = torch.stack([sample[key] for sample in list_dict_of_tensors])
-    return result

diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h
index 73bbe65a8..013dd8366 100644
--- a/megatron/fused_kernels/scaled_masked_softmax.h
+++ b/megatron/fused_kernels/scaled_masked_softmax.h
@@ -738,4 +738,4 @@ void dispatch_scaled_masked_softmax_backward(
             break;
         }
     }
-}
\ No newline at end of file
+}

diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 30b3a5262..3a407fc0e 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -157,16 +157,7 @@ def forward(self, input, mask):
         assert input.dim() == 4

         if self.is_kernel_available(mask, *input.size()):
-            result = self.forward_fused_softmax(input, mask)
-            for batch_id in range(len(mask)):
-                print("Batch id", batch_id)
-                print("    inputs", input.shape, input[batch_id, 0])
-                print("    mask", mask.shape, mask[batch_id, 0])
-                print("    result", result.shape, result[batch_id, 0])
-                print("    hello", torch.nonzero(~mask[batch_id, 0])[100:150])
-                print("    bye", torch.nonzero(result[batch_id, 0])[41:100])
-                print("    all ones?", torch.sum(result, dim=-1))
-            return result
+            return self.forward_fused_softmax(input, mask)
         else:
             return self.forward_torch_softmax(input, mask)
@@ -197,7 +188,7 @@ def forward_fused_softmax(self, input, mask):

         if self.attn_mask_type == AttnMaskType.causal:
             assert sq == sk, "causal mask is only for self attention"
-            assert mask is None
+            assert mask is None, "Mask is silently ignored due to the use of a custom kernel"

             # input is 3D tensor (attn_batches, sq, sk)
             input = input.view(-1, sq, sk)

diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py
index 9f4e5b772..09304b1dd 100644
--- a/megatron/tokenizer/tokenizer.py
+++ b/megatron/tokenizer/tokenizer.py
@@ -361,26 +361,22 @@ def eod(self):
     @property
     def cls(self):
         candidate = self.tokenizer.cls_token_id
-        self._check_token_candidate(candidate)
-        return candidate
+        return self._check_token_candidate(candidate)

     @property
     def sep(self):
         candidate = self.tokenizer.sep_token_id
-        self._check_token_candidate(candidate)
-        return candidate
+        return self._check_token_candidate(candidate)

     @property
     def pad(self):
         candidate = self.tokenizer.pad_token_id
-        self._check_token_candidate(candidate)
-        return candidate
+        return self._check_token_candidate(candidate)

     @property
     def mask(self):
         candidate = self.tokenizer.mask_token_id
-        self._check_token_candidate(candidate)
-        return candidate
+        return self._check_token_candidate(candidate)

     @property
     def bos(self):
@@ -390,8 +386,7 @@ def bos(self):
     def eos(self):
         # TODO @thomasw21 might conflict with the notion of
         candidate = self.tokenizer.eos_token_id
-        self._check_token_candidate(candidate)
-        return candidate
+        return self._check_token_candidate(candidate)

     @property
     def additional_special_tokens_ids(self):
@@ -402,3 +397,4 @@ def additional_special_tokens_ids(self):
     def _check_token_candidate(candidate):
         if candidate is None:
             raise AttributeError("Token doesn't exist")
+        return candidate

diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index faa45050c..0138f50e2 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -53,24 +53,6 @@ def model_provider(pre_process=True, post_process=True):
                              enabled=args.zero_stage == 3,
                              mpu=mpu):
         if args.deepspeed:
-            # We don't need it.
-            # # Precompute the attention mask and store it in args. This avoids having to
-            # # pipeline it as an activation during training. The mask is constant, and thus
-            # # we can reuse it.
-            # attention_mask = torch.tril(torch.ones(
-            #     (1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view(
-            #     1, 1, args.seq_length, args.seq_length)
-            #
-            # # Convert attention mask to binary:
-            # attention_mask = (attention_mask < 0.5)
-            # if args.fp16:
-            #     attention_mask = attention_mask.half()
-            # elif args.bf16:
-            #     attention_mask = attention_mask.bfloat16()
-            #
-            # # must be bool or the training crashes expecting bool, but getting Half
-            # args.attn_mask = attention_mask.to(torch.bool)
-
             model = GPTModelPipe(
                 num_tokentypes=0,
                 parallel_output=True

diff --git a/tests/test_model.py b/tests/test_model.py
index 1026f6acc..3c90cede3 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -408,9 +408,6 @@ def test_fused_masked_softmax(self):

     def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self):
-        # TODO @thomasw21 make sure that if pass a causal mask, it is take in account. The following shows that fused_kernel completely ignores the masking is we set the variable incorrectly.
-        #  https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/131bd43e9f3552f2413a442f51c22214d4f6fb19/megatron/model/fused_softmax.py#L190
-        #  Maybe we should pass None is case as attention_mask instead of silently ignoring mask.
         command_args = get_default_args(self.test_file_dir_str)

         command_args["--position-embedding-type"] = "alibi"
@@ -479,5 +476,4 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i
         # Test model handles padding correctly
         output_changed_pad = model.eval_batch(iter_out_of_one({**data, "decoder_token_ids": token_ids_changed_pad, "decoder_segment_ids": segment_ids_changed_pad}), compute_loss=False)

-        print(output_changed_pad)
         self.assertFalse(torch.any(torch.isnan(output_changed_pad)))

diff --git a/tests/test_training.py b/tests/test_training.py
index d877136e2..6031f1bcb 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -490,6 +490,7 @@ def test_training_t0(self):
             --num-attention-heads 2
             --seq-length 128
             --max-position-embeddings 1024
+            --position-embedding-type alibi
             --micro-batch-size 1
             --rampup-batch-size 2 2 {n_samples}
             --global-batch-size 16

From de88ab63208e05e133d8fe9d97ed7393b2f17fa9 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 16:34:26 +0200
Subject: [PATCH 278/297] Woops

---
 megatron/data/decoder_packed_mtf_dataset.py | 10 ++++------
 tests/test_dataloaders.py | 5 ++---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index bbd3f45ad..b091e7218 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -405,7 +405,7 @@ def _build_index_mappings(
     shuffle_idx = []
     sample_idx = []
     while len(sample_idx) <= num_samples:
-        new_document_ids = _build_shuffle_idx(documents=documents, total_size=len(mtf_dataset), np_rng=np_rng)
+        new_document_ids = _build_shuffle_idx(documents=documents, np_rng=np_rng)
         # Generate a shuffling of the entire dataset
         shuffle_idx.append(new_document_ids)
         # Packs them into a single sample
@@ -418,7 +418,7 @@ def _build_index_mappings(
             epoch=epoch
         )
         sample_idx.extend(new_samples)
-        epoch +=1
+        epoch += 1

     shuffle_idx = np.concatenate(shuffle_idx, axis=0)
     sample_idx = np.stack(sample_idx, axis=0)
@@ -474,11 +474,9 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam
     return full_samples, row_length, current_sample_start

-def _build_shuffle_idx(documents: np.array, total_size: int, np_rng):
+def _build_shuffle_idx(documents: np.array, np_rng):
     """Build the range [0, dataset_size) and shuffle."""
-    dtype_ = np.uint32
-    if total_size >= (np.iinfo(np.uint32).max - 1):
-        dtype_ = np.int64
+    dtype_ = np.int64

     result = np.arange(start=0, stop=len(documents),
                        step=1, dtype=dtype_)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 1073eb98c..38caddd60 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -128,8 +128,8 @@ def test_mlm_dataset(self):
         command_args = get_default_args()
         data_path = self.copy_data_to_temp(self.data_dir, "gpt2/meg-gpt2-openwebtext_text_document")
         command_args["--data-path"] = data_path
-        command_args["--noise_density"] = "0.15"
-        command_args["--mean_noise_span_length"] = "3"
+        command_args["--noise-density"] = "0.15"
+        command_args["--mean-noise-span-length"] = "3"
         command_args["--vocab-extra-ids"] = "100"

         with patch('sys.argv', flatten_arguments(command_args)):
@@ -307,7 +307,6 @@ def test_finetune_t0_non_causal_decoder_get_bath_pipe(self):

         torch_assert_equal(tokens, data["decoder_token_ids"][:, :-1])
         torch_assert_equal(labels, data["decoder_token_ids"][:, 1:])

-        # TODO @thomasw21 check that attention_mask is `1` between segments, ie segments are independent
         for batch_id in range(args.micro_batch_size):
             segment_cuts = torch.nonzero(data["decoder_segment_ids"][batch_id, 1:] - data["decoder_segment_ids"][batch_id, :-1]) + 1
             for segment_start, segment_end in zip([0, *segment_cuts], [*segment_cuts, args.seq_length]):
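The test touched just above asserts that packed segments stay independent under the attention mask. A sketch of the mask semantics being asserted (an assumption spelled out in code, not the repo's implementation; the actual kernels may use the inverted True/False convention):

    import torch

    def packed_causal_mask(segment_ids):
        # segment_ids: [batch, seq], 0 marks padding
        seq = segment_ids.shape[1]
        causal = torch.tril(torch.ones(seq, seq, dtype=torch.bool))
        same_segment = segment_ids.unsqueeze(-1) == segment_ids.unsqueeze(-2)
        not_pad = (segment_ids != 0).unsqueeze(-1)
        # True where position i may attend to position j: causal within a
        # segment, never across segments, never from padding
        return causal & same_segment & not_pad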
From d7a6388a495a4d43c9224c3112bc691e5d8bc153 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 17:41:48 +0200
Subject: [PATCH 279/297] Woops

---
 finetune_t0_non_causal_decoder.py | 2 +-
 megatron/data/decoder_packed_mtf_dataset.py | 1 +
 megatron/data/mlm_dataset.py | 8 ++------
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py
index 9e780d31a..97045b366 100644
--- a/finetune_t0_non_causal_decoder.py
+++ b/finetune_t0_non_causal_decoder.py
@@ -117,7 +117,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
             data_prefix=args.data_path,
             data_impl=args.data_impl,
             splits_string=args.split,
-            seq_length=args.seq_length,
+            seq_length=args.seq_length + 1,
             pad_token=tokenizer.pad,
             eos_token=tokenizer.eos,
             train_valid_test_num_samples=train_val_test_num_samples,

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index b091e7218..d8f80b5a0 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -457,6 +457,7 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam

     full_samples = []
     current_sample_start = old_sample_start
+    assert epoch * len(document_ids) > current_sample_start
     for current_sample_end, document_id in enumerate(document_ids):
         current_sample_end = epoch * len(document_ids) + current_sample_end
         sample = mtf_dataset[document_id]

diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py
index 4ac4624b1..872825c78 100644
--- a/megatron/data/mlm_dataset.py
+++ b/megatron/data/mlm_dataset.py
@@ -327,12 +327,8 @@ def __init__(
         assert len(self.sentinel_token_ids) >= self.num_noise_spans, "Not enough sentinel tokens, please add more"

         args = get_args()
-        if hasattr(args, "encoder_seq_length") and args.encoder_seq_length is not None:
-            # T5 style
-            assert self.inputs_length == args.encoder_seq_length
-            assert self.targets_length == args.decoder_seq_length + 1
-        else:
-            assert self.inputs_length + self.targets_length == args.seq_length
+        # TODO @thomasw21 check once we merge t5
+        assert self.inputs_length + self.targets_length == args.seq_length

     def __len__(self):
         return len(self._gpt_dataset)

From 1c2284f127e6d23aeca0929a7648eefaa05c7986 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 17:48:54 +0200
Subject: [PATCH 280/297] Woops

---
 megatron/data/decoder_packed_mtf_dataset.py | 5 ++---
 tests/test_dataloaders.py | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index d8f80b5a0..9781eb11d 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -457,7 +457,7 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam

     full_samples = []
     current_sample_start = old_sample_start
-    assert epoch * len(document_ids) > current_sample_start
+    assert epoch * len(document_ids) >= current_sample_start
     for current_sample_end, document_id in enumerate(document_ids):
         current_sample_end = epoch * len(document_ids) + current_sample_end
         sample = mtf_dataset[document_id]
@@ -479,8 +479,7 @@ def _build_shuffle_idx(documents: np.array, np_rng):
     """Build the range [0, dataset_size) and shuffle."""
     dtype_ = np.int64

-    result = np.arange(start=0, stop=len(documents),
-                       step=1, dtype=dtype_)
+    result = np.arange(start=0, stop=len(documents),step=1, dtype=dtype_)

     # in-place shuffling
     np_rng.shuffle(result)

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index 38caddd60..e69b918c8 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -154,7 +154,7 @@ def test_mlm_dataset(self):
             splits_string=args.split,
             # TODO @thomasw21 figure how that value works
             train_valid_test_num_samples=train_val_test_num_samples,
-            sequence_length=args.seq_length,
+            sequence_length=args.seq_length + 1,
             noise_density=args.noise_density,
             mean_noise_span_length=args.mean_noise_span_length,
             seed=args.seed,

From b759a92a1169d9a2847b7b691e154f32bb46c423 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 18:21:16 +0200
Subject: [PATCH 281/297] Woops

---
 megatron/data/mlm_dataset.py | 2 +-
 megatron/data/mtf_dataset.py | 2 +-
 tests/test_dataloaders.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py
index 872825c78..e3a1d9d34 100644
--- a/megatron/data/mlm_dataset.py
+++ b/megatron/data/mlm_dataset.py
@@ -328,7 +328,7 @@ def __init__(
         args = get_args()
         # TODO @thomasw21 check once we merge t5
-        assert self.inputs_length + self.targets_length == args.seq_length
+        assert self.inputs_length + self.targets_length == args.seq_length + 1

     def __len__(self):
         return len(self._gpt_dataset)

diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py
index 72ed3de16..3e61bde7c 100644
--- a/megatron/data/mtf_dataset.py
+++ b/megatron/data/mtf_dataset.py
@@ -262,7 +262,6 @@ def __init__(
         documents,
         seed,
     ):
-
         # Params to store.
         self.name = name
@@ -318,6 +317,7 @@ def _build_index_mappings(
     # Filename of the index mappings.
     _filename = data_prefix
     _filename += '_{}_indexmap'.format(name)
+    _filename += '_{}docs'.format(len(documents))
     _filename += '_{}s'.format(seed)
     doc_idx_filename = _filename + '_mtf_doc_idx.npy'
     shuffle_idx_filename = _filename + '_mtf_shuffle_idx.npy'

diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py
index e69b918c8..38caddd60 100644
--- a/tests/test_dataloaders.py
+++ b/tests/test_dataloaders.py
@@ -154,7 +154,7 @@ def test_mlm_dataset(self):
             splits_string=args.split,
             # TODO @thomasw21 figure how that value works
             train_valid_test_num_samples=train_val_test_num_samples,
-            sequence_length=args.seq_length + 1,
+            sequence_length=args.seq_length,
             noise_density=args.noise_density,
             mean_noise_span_length=args.mean_noise_span_length,
             seed=args.seed,

From ef20e57a5af52afdc9f87882aac408274ca12f43 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 18:44:14 +0200
Subject: [PATCH 282/297] Woops

---
 megatron/data/mlm_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/data/mlm_dataset.py b/megatron/data/mlm_dataset.py
index e3a1d9d34..dcc66d2c0 100644
--- a/megatron/data/mlm_dataset.py
+++ b/megatron/data/mlm_dataset.py
@@ -314,7 +314,7 @@ def __init__(
             indexed_dataset=self.indexed_dataset,
             num_samples=num_samples,
             # -1 because GPTDataset will return `seq_length + 1` sequences.
-            seq_length=number_of_raw_tokens - 1,
+            seq_length=self.number_of_raw_tokens - 1,
             seed=seed
         )

From 5816adfb48df56a5ecdb62045381afa09724cde1 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 2 Jul 2022 21:44:50 +0200
Subject: [PATCH 283/297] Silently skip samples that are too long

---
 megatron/data/decoder_packed_mtf_dataset.py | 19 ++++++++++++++++++-
 megatron/data/mtf_dataset.py | 3 +++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py
index 9781eb11d..c4e20382b 100644
--- a/megatron/data/decoder_packed_mtf_dataset.py
+++ b/megatron/data/decoder_packed_mtf_dataset.py
@@ -336,13 +336,23 @@ def pack_samples(self, items):
         # `0` is reserved for padding
         item_num = 1
         cur_len = 0
+
+        assert len(items) > 0
+
         for token_dict in items:
             input_token_len = len(token_dict["input_tokens"])
             target_token_len = len(token_dict["target_tokens"])
+
             total_len = input_token_len + target_token_len

             if cur_len + total_len > self.seq_length:
-                break
+                # This should not happen at the indexing should only allow the correct number of items
+                raise ValueError(f"""Items to be packed do not fit inside a single sample.
+                    current length: {cur_len}
+                    input tokens length: {input_token_len}
+                    target token length: {target_token_len}
+                    expected sequence length: {self.seq_length}
+                """)

             decoder_tokens[cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"])
             decoder_tokens[cur_len + input_token_len: cur_len + total_len] = torch.from_numpy(
                 token_dict["target_tokens"])
@@ -472,6 +482,13 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam
             current_sample_start = current_sample_end
             row_length = tok_len

+            if tok_len > seq_length:
+                # TODO @thomasw21 handle the case where a single sample cannot fit inside a row. We can
+                # - silently skip that value [currently implemented]
+                # - truncate to `seq_length`, and keep the right part
+                current_sample_start += 1
+                row_length = 0
+
     return full_samples, row_length, current_sample_start

diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py
index 3e61bde7c..99fc8c095 100644
--- a/megatron/data/mtf_dataset.py
+++ b/megatron/data/mtf_dataset.py
@@ -295,6 +295,9 @@ def __getitem__(self, idx):
         input_tokens = self.input_indexed_dataset.get(self.doc_idx[idx])
         target_tokens = self.target_indexed_dataset.get(self.doc_idx[idx])

+        assert len(input_tokens) > 0
+        assert len(target_tokens) > 0
+
         return {
             'input_tokens': np.array(input_tokens, dtype=np.int64),
             'target_tokens': np.array(target_tokens, dtype=np.int64),
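Before the next round of fixes, it helps to restate the packing contract `_build_sample_idx` is converging on. A simplified sketch under stated assumptions (hypothetical name, plain token lengths instead of an mtf_dataset; the real code additionally threads `row_offset`, `old_sample_start` and `epoch` across successive epochs):

    def pack_greedily(doc_lengths, seq_length):
        full_samples, start, cur_len = [], 0, 0
        for end, tok_len in enumerate(doc_lengths):
            if tok_len > seq_length:
                # a document that can never fit on its own: flush what we
                # have so far, then silently skip it
                if end > start:
                    full_samples.append((start, end))
                start, cur_len = end + 1, 0
                continue
            if cur_len + tok_len > seq_length:
                # next document would overflow the row: close the current sample
                full_samples.append((start, end))
                start, cur_len = end, 0
            cur_len += tok_len
        # packed (start, end) index pairs + the unfinished remainder
        return full_samples, cur_len, start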
f"{self.data_dir}/gpt2" + data_dir = self.copy_data_to_temp(self.data_dir, "gpt2") output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) logs_dir = f"{output_dir}/logs" Path(logs_dir).mkdir(parents=True, exist_ok=True) From 1572ddc9c77526339b3780381d46394d6cbf0f15 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 21:57:46 +0200 Subject: [PATCH 285/297] Prevent empty dataset --- megatron/data/decoder_packed_mtf_dataset.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index c4e20382b..f24c798f7 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -475,6 +475,17 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam # TODO @thomasw21 figure out if we add tokens tok_len = len(sample["input_tokens"]) + len(sample["target_tokens"]) + if tok_len > seq_length: + # TODO @thomasw21 handle the case where a single sample cannot fit inside a row. We can + # - silently skip that value [currently implemented] + # - truncate to `seq_length`, and keep the right part + + # Detect is the the sample is the first one. + if row_length != 0: + full_samples.append(np.asarray([current_sample_start, current_sample_end])) + current_sample_start = current_sample_end + 1 # skipping + row_length = 0 + row_length = row_length + tok_len if row_length > seq_length: # current sample can't be added and requires to be added in the next one @@ -482,12 +493,7 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam current_sample_start = current_sample_end row_length = tok_len - if tok_len > seq_length: - # TODO @thomasw21 handle the case where a single sample cannot fit inside a row. We can - # - silently skip that value [currently implemented] - # - truncate to `seq_length`, and keep the right part - current_sample_start += 1 - row_length = 0 + return full_samples, row_length, current_sample_start From bebb481a1a3496686e85875245f8f8e328d67a0e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 2 Jul 2022 22:00:50 +0200 Subject: [PATCH 286/297] Change the condition for empty slice --- megatron/data/decoder_packed_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index f24c798f7..8336d5e68 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -481,7 +481,7 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam # - truncate to `seq_length`, and keep the right part # Detect is the the sample is the first one. 
- if row_length != 0: + if current_sample_end - 1 > current_sample_start: full_samples.append(np.asarray([current_sample_start, current_sample_end])) current_sample_start = current_sample_end + 1 # skipping row_length = 0 From 5c806992bb414c6d1cc2f33b16056619e5b997a6 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 13:56:10 +0200 Subject: [PATCH 287/297] PR reviews --- finetune_t0_non_causal_decoder.py | 3 +- megatron/arguments.py | 2 +- megatron/data/data_samplers.py | 58 ------- megatron/data/decoder_packed_mtf_dataset.py | 42 ++--- megatron/data/non_causal_mlm_dataset.py | 165 -------------------- megatron/model/fused_softmax.py | 7 + megatron/model/transformer.py | 6 - megatron/training.py | 8 +- tests/data/gpt2/README.md | 3 + tests/test_dataloaders.py | 15 +- tests/test_model.py | 11 +- tests/test_training.py | 8 +- 12 files changed, 45 insertions(+), 283 deletions(-) delete mode 100644 megatron/data/non_causal_mlm_dataset.py create mode 100644 tests/data/gpt2/README.md diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 97045b366..96899503d 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -98,7 +98,7 @@ def get_batch_pipe(data): if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]: raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.") - return (tokens, position_ids, attention_mask), (labels, loss_mask) + return (tokens, None, attention_mask), (labels, loss_mask) def train_valid_test_datasets_provider(train_val_test_num_samples): @@ -110,7 +110,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): print_rank_0("> building train, validation, and test datasets for T0 ...") # Option 1 of data loading using --data-path - # For T0, data has to be provided in the form --data-path input-data target-data input-data2 target-data2 ... 
if args.data_path: # TODO: Not yet compatible with dataset weights (Will break at prefixes, weights = analyze_data_prefix(args.data_path)) train_ds, valid_ds, test_ds = build_train_valid_test_datasets( diff --git a/megatron/arguments.py b/megatron/arguments.py index 31a8d4000..02ba858e1 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -553,7 +553,7 @@ def _add_training_args(parser): 'please refer https://github.com/facebookresearch/bitsandbytes.', dest='use_bnb_optimizer') group.add_argument('--dataloader-type', type=str, default=None, - choices=['single', 'cyclic', 'decoder_packed'], + choices=['single', 'cyclic'], help='Single pass vs multiple pass data loader') group.add_argument('--cpu-optimizer', action='store_true', help='Run optimizer on CPU') diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 01b8f9405..0aa422974 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -44,14 +44,6 @@ def build_pretraining_data_loader(dataset, consumed_samples, num_workers=None): micro_batch_size=args.micro_batch_size, data_parallel_rank=mpu.get_data_parallel_rank(), data_parallel_size=mpu.get_data_parallel_world_size()) - elif args.dataloader_type == 'decoder_packed': - assert isinstance(dataset, DecoderPackedMTFDataset) - batch_sampler = MegatronDecoderPackedText2TextSampler( - total_samples=len(dataset), - consumed_samples=consumed_samples, - micro_batch_size=args.micro_batch_size, - data_parallel_rank=mpu.get_data_parallel_rank(), - data_parallel_size=mpu.get_data_parallel_world_size()) else: raise Exception('{} dataloader type is not supported.'.format( args.dataloader_type)) @@ -170,53 +162,3 @@ def __iter__(self): self.consumed_samples += self.micro_batch_times_data_parallel_size yield batch batch = [] - - -class MegatronDecoderPackedText2TextSampler(object): - """Sampler used with `DecoderPackedMTFDataset""" - - def __init__(self, total_samples, consumed_samples, micro_batch_size, - data_parallel_rank, data_parallel_size, drop_last=True): - # Keep a copy of input params for later use. - self.total_samples = total_samples - self.consumed_samples = consumed_samples - self.micro_batch_size = micro_batch_size - self.data_parallel_rank = data_parallel_rank - self.micro_batch_times_data_parallel_size = \ - self.micro_batch_size * data_parallel_size - self.drop_last = drop_last - - # Sanity checks. 
- assert self.total_samples > 0, \ - 'no sample to consume: {}'.format(self.total_samples) - assert self.consumed_samples < self.total_samples, \ - 'no samples left to consume: {}, {}'.format(self.consumed_samples, - self.total_samples) - assert self.micro_batch_size > 0 - assert data_parallel_size > 0 - assert self.data_parallel_rank < data_parallel_size, \ - 'data_parallel_rank should be smaller than data size: {}, ' \ - '{}'.format(self.data_parallel_rank, data_parallel_size) - - def __len__(self): - return self.total_samples - - def get_start_end_idx(self): - start_idx = self.data_parallel_rank * self.micro_batch_size - end_idx = start_idx + self.micro_batch_size - return start_idx, end_idx - - def __iter__(self): - batch = [] - # Last batch will be dropped if drop_last is not set False - for idx in range(self.consumed_samples, self.total_samples): - batch.append(idx) - if len(batch) == self.micro_batch_times_data_parallel_size: - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] - batch = [] - - # Check the last partial batch and see drop_last is set - if len(batch) > 0 and not self.drop_last: - start_idx, end_idx = self.get_start_end_idx() - yield batch[start_idx:end_idx] diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 8336d5e68..85e41970b 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -293,7 +293,7 @@ def __init__( self.pad_token = pad_token self.seq_length = seq_length - self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, documents=documents, mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=seed) + self.sample_index, self.shuffle_index = _build_index_mappings(name=name, data_prefix=data_prefix, nb_documents=len(documents), mtf_dataset=self.mtf_dataset, num_samples=num_samples, seq_length=seq_length, seed=seed) def __len__(self): return len(self.sample_index) @@ -329,9 +329,9 @@ def pack_samples(self, items): decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. 
""" - decoder_tokens = torch.full((self.seq_length,), self.pad_token, dtype=torch.int64) - decoder_segment_ids = torch.zeros((self.seq_length,), dtype=torch.int64) - decoder_is_inputs = torch.full((self.seq_length,), False, dtype=torch.bool) + decoder_tokens = np.full((self.seq_length,), self.pad_token, dtype=torch.int64) + decoder_segment_ids = np.zeros((self.seq_length,), dtype=np.int64) + decoder_is_inputs = np.full((self.seq_length,), False, dtype=bool) # `0` is reserved for padding item_num = 1 @@ -354,9 +354,8 @@ def pack_samples(self, items): expected sequence length: {self.seq_length} """) - decoder_tokens[cur_len: cur_len + input_token_len] = torch.from_numpy(token_dict["input_tokens"]) - decoder_tokens[cur_len + input_token_len: cur_len + total_len] = torch.from_numpy( - token_dict["target_tokens"]) + decoder_tokens[cur_len: cur_len + input_token_len] = token_dict["input_tokens"] + decoder_tokens[cur_len + input_token_len: cur_len + total_len] = token_dict["target_tokens"] decoder_segment_ids[cur_len: cur_len + total_len] = item_num decoder_is_inputs[cur_len: cur_len + input_token_len] = 1 # inputs # targets are already 0 at init, no need to update `decoder_is_inputs` @@ -365,7 +364,6 @@ def pack_samples(self, items): cur_len += total_len assert cur_len < self.seq_length - # Normally the default collate_fn handles torch tensor conversion; As we use a custom collate_fn, do it here return { "decoder_token_ids": decoder_tokens, "decoder_segment_ids": decoder_segment_ids, @@ -376,7 +374,7 @@ def pack_samples(self, items): def _build_index_mappings( name, data_prefix, - documents, + nb_documents, mtf_dataset, num_samples: int, seq_length: int, @@ -415,7 +413,7 @@ def _build_index_mappings( shuffle_idx = [] sample_idx = [] while len(sample_idx) <= num_samples: - new_document_ids = _build_shuffle_idx(documents=documents, np_rng=np_rng) + new_document_ids = _build_shuffle_idx(nb_documents=nb_documents, np_rng=np_rng) # Generate a shuffling of the entire dataset shuffle_idx.append(new_document_ids) # Packs them into a single sample @@ -475,17 +473,6 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam # TODO @thomasw21 figure out if we add tokens tok_len = len(sample["input_tokens"]) + len(sample["target_tokens"]) - if tok_len > seq_length: - # TODO @thomasw21 handle the case where a single sample cannot fit inside a row. We can - # - silently skip that value [currently implemented] - # - truncate to `seq_length`, and keep the right part - - # Detect is the the sample is the first one. - if current_sample_end - 1 > current_sample_start: - full_samples.append(np.asarray([current_sample_start, current_sample_end])) - current_sample_start = current_sample_end + 1 # skipping - row_length = 0 - row_length = row_length + tok_len if row_length > seq_length: # current sample can't be added and requires to be added in the next one @@ -493,16 +480,21 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam current_sample_start = current_sample_end row_length = tok_len - - + if tok_len > seq_length: + # TODO @thomasw21 handle the case where a single sample cannot fit inside a row. 
We can + # - silently skip that value [currently implemented] + # - truncate to `seq_length`, and keep the right part + current_sample_start = current_sample_end + 1 # skipping + row_length = 0 + continue return full_samples, row_length, current_sample_start -def _build_shuffle_idx(documents: np.array, np_rng): +def _build_shuffle_idx(nb_documents: int, np_rng): """Build the range [0, dataset_size) and shuffle.""" dtype_ = np.int64 - result = np.arange(start=0, stop=len(documents),step=1, dtype=dtype_) + result = np.arange(start=0, stop=nb_documents, step=1, dtype=dtype_) # in-place shuffling np_rng.shuffle(result) diff --git a/megatron/data/non_causal_mlm_dataset.py b/megatron/data/non_causal_mlm_dataset.py deleted file mode 100644 index d5f435d37..000000000 --- a/megatron/data/non_causal_mlm_dataset.py +++ /dev/null @@ -1,165 +0,0 @@ -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""T5 Style dataset.""" - -import collections - -import numpy as np -import torch - -from megatron import get_tokenizer -from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping -) - -class NonCausalMLMDataset(torch.utils.data.Dataset): - - def __init__(self, name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, masked_lm_prob, - max_seq_length, - short_seq_prob, seed): - - # Params to store. - self.name = name - self.seed = seed - self.masked_lm_prob = masked_lm_prob - self.max_seq_length = max_seq_length - - # Dataset. - self.indexed_dataset = indexed_dataset - - # Build the samples mapping. - self.samples_mapping = get_samples_mapping(self.indexed_dataset, - data_prefix, - num_epochs, - max_num_samples, - self.max_seq_length - 2, # account for added tokens - short_seq_prob, - self.seed, - self.name, - False) - - # Vocab stuff. - tokenizer = get_tokenizer() - self.vocab_id_list = list(tokenizer.inv_vocab.keys()) - self.vocab_id_to_token_dict = tokenizer.inv_vocab - self.cls_id = tokenizer.cls - self.sep_id = tokenizer.sep - self.mask_id = tokenizer.mask - self.pad_id = tokenizer.pad - self.bos_id = tokenizer.bos_token_id - self.eos_id = tokenizer.eos_token_id - self.sentinel_tokens = tokenizer.additional_special_tokens_ids - assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" - - def __len__(self): - return self.samples_mapping.shape[0] - - def __getitem__(self, idx): - - start_index, end_index, seq_length = self.samples_mapping[idx] - sample = [] - for index in range(start_index, end_index): - sample.append(self.indexed_dataset[index]) - # Note that this rng state should be numpy and not python since - # python randint is inclusive whereas the numpy one is exclusive. 
- np_rng = np.random.RandomState(seed=(self.seed + idx)) - return build_training_sample(sample, - self.max_seq_length, # needed for padding - self.vocab_id_list, - self.vocab_id_to_token_dict, - self.cls_id, self.sep_id, - self.mask_id, self.pad_id, - self.masked_lm_prob, np_rng, - self.bos_id, self.eos_id, - self.sentinel_tokens) - - -def build_training_sample(sample, - max_seq_length, - vocab_id_list, vocab_id_to_token_dict, - cls_id, sep_id, mask_id, pad_id, - masked_lm_prob, np_rng, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Build training sample. - - Arguments: - sample: A list of sentences in which each sentence is a list token ids. - max_seq_length: Maximum length of the sequence. All values are padded to - this length. - vocab_id_list: List of vocabulary ids. Used to pick a random id. - vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. - cls_id: Start of example id. - sep_id: Separator id. - mask_id: Mask token id. - pad_id: Padding token id. - masked_lm_prob: Probability to mask tokens. - np_rng: Random number genenrator. Note that this rng state should be - numpy and not python since python randint is inclusive for - the opper bound whereas the numpy one is exclusive. - bos_id: start of decoder example id - eos_id: end of generation id - sentinel_tokens: unique value to be substituted for every replaced span - """ - - # flatten sentences into one list - tokens = [token for sentence in sample for token in sentence] - - # Truncate to `target_sequence_length`. - max_num_tokens = max_seq_length - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] - - # Masking. - max_predictions_per_seq = masked_lm_prob * max_num_tokens - (tokens, masks, labels, _, masked_spans) = create_masked_lm_predictions( - tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, - cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng - ) - - # Padding. - padded_tokens = pad_and_convert_to_numpy(tokens, max_seq_length) - padded_labels = pad_and_convert_to_numpy(labels, max_seq_length) - padded_masks = pad_and_convert_to_numpy(masks, max_seq_length) - - print(padded_tokens) - print(padded_labels) - import sys - sys.exit() - - train_sample = { - 'text': padded_tokens, - 'labels': padded_labels, - 'mask': padded_masks, - 'prefix_len': 0 - } - return train_sample - - -def pad_and_convert_to_numpy(tokens, pad_id, max_seq_length): - """Pad sequences and convert them to numpy.""" - - # Some checks. - num_tokens = len(tokens) - padding_length = max_seq_length - num_tokens - assert padding_length >= 0 - - # Tokens and token types. 
- filler = np.array([pad_id] * padding_length) - tokens_np = np.concatenate((tokens, filler), dtype=np.int64) - - return tokens_np \ No newline at end of file diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 3a407fc0e..e2983a75d 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -207,7 +207,14 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale + + if self.attn_mask_type == AttnMaskType.causal: + assert mask is None + mask = torch.ones_like(input, dtype=torch.bool) + mask = torch.triu(mask, diagonal=1, out=mask) + mask_output = self.mask_func(input, mask) if mask is not None else input + probs = torch.nn.Softmax(dim=-1)(mask_output) if self.input_in_float16 and self.softmax_in_fp32: diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 9232d84a5..03e6faaec 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -634,16 +634,10 @@ class ParallelTransformerLayerPipe(ParallelTransformerLayer): 2) forward(input, **kwargs) -> output When the mask is static over all samples, it is advantageous to cache the mask and avoid communicating it. - - If no mask is provided, the module will query `self._args.attn_mask` - for the mask and only return `super().forward(...)` """ def forward(self, inputs, **kwargs): assert torch.is_tensor(inputs) or isinstance(inputs, tuple) if torch.is_tensor(inputs) or len(inputs) == 1: - # No attention mask forwarded, search for args.attn_mask - if not hasattr(self, '_args'): - self._args = get_args() hidden_states, attention_mask = inputs, None return super().forward(hidden_states, attention_mask, **kwargs) elif len(inputs) == 2: diff --git a/megatron/training.py b/megatron/training.py index c1033c997..2d45ca808 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1178,23 +1178,23 @@ def build_train_valid_test_data_iterators( # Build iterators. dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic', 'decoder_packed'] + assert dl_type in ['single', 'cyclic'] if train_dataloader is not None: - train_data_iterator = iter(train_dataloader) if dl_type in ['single', 'decoder_packed'] \ + train_data_iterator = iter(train_dataloader) if dl_type in ['single'] \ else iter(cyclic_iter(train_dataloader)) else: train_data_iterator = None if valid_dataloaders is not None: - valid_data_iterators = [iter(vdl) if dl_type in ['single', 'decoder_packed'] \ + valid_data_iterators = [iter(vdl) if dl_type in ['single'] \ else iter(cyclic_iter(valid_dataloaders)) for vdl in valid_dataloaders] else: valid_data_iterators = [None] * num_valid_ds if test_dataloaders is not None: - test_data_iterators = [iter(tdl) if dl_type in ['single', 'decoder_packed'] \ + test_data_iterators = [iter(tdl) if dl_type in ['single'] \ else iter(cyclic_iter(test_dataloaders)) for tdl in test_dataloaders] else: diff --git a/tests/data/gpt2/README.md b/tests/data/gpt2/README.md new file mode 100644 index 000000000..ad8eed839 --- /dev/null +++ b/tests/data/gpt2/README.md @@ -0,0 +1,3 @@ +Dataset used for testing. 
+ +`ag_news_prompt*`: manually generated from dataset available at https://huggingface.co/datasets/TimeRobber/ag_news_classify_question_first_100 \ No newline at end of file diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 38caddd60..f1f2cfa38 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -118,10 +118,7 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - if os.path.isdir(folder): - shutil.copytree(src_folder, dest_folder) - else: - shutil.copy2(src_folder, dest_folder) + shutil.copytree(src_folder, dest_folder) return dest_path def test_mlm_dataset(self): @@ -173,7 +170,6 @@ def test_mtf_dataset(self): command_args = get_default_args() data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt") command_args["--data-path"] = data_path - command_args["--dataloader-type"] = "decoder_packed" with patch('sys.argv', flatten_arguments(command_args)): with mockenv_context(**self.dist_env_1_gpu): @@ -203,7 +199,6 @@ def test_decoder_packed_mtf_dataloader(self): command_args = get_default_args() data_path = self.copy_data_to_temp(self.data_dir, "gpt2/ag_news_prompt") command_args["--data-path"] = data_path - command_args["--dataloader-type"] = "decoder_packed" with patch('sys.argv', flatten_arguments(command_args)): with mockenv_context(**self.dist_env_1_gpu): @@ -233,12 +228,12 @@ def test_decoder_packed_mtf_dataloader(self): skip_warmup=(not args.mmap_warmup) ) - batch_sampler = build_pretraining_data_loader( + batch_iterator = build_pretraining_data_loader( train_ds, consumed_samples=0, num_workers=4 ) last_padding_size = 0 - for i, items in enumerate(batch_sampler): + for i, items in enumerate(batch_iterator): micro_batch_size, seq_length = items["decoder_token_ids"].shape # Check dtypes @@ -257,7 +252,7 @@ def test_decoder_packed_mtf_dataloader(self): # `segment_ids` is [1,2,...] 
self.assertEqual(segment_ids[:-1], list(range(1, len(segment_ids)))) # `0` signify that the tokens are padding - self.assertEqual(segment_ids[-1], 0) + self.assertIn(segment_ids[-1], [0, len(segment_ids) + 1]) original_samples_count += len([segment_id for segment_id in segment_ids if segment_id != 0]) # Test that we actually pack, ie we have more samples than the `batch_size` @@ -272,7 +267,7 @@ def test_decoder_packed_mtf_dataloader(self): last_padding_size = len([None for segment_id in items["decoder_segment_ids"][micro_batch_size - 1] if segment_id == 0]) - def test_finetune_t0_non_causal_decoder_get_bath_pipe(self): + def test_finetune_t0_non_causal_decoder_get_batch_pipe(self): command_args = get_default_args() command_args["--position-embedding-type"] = "alibi" diff --git a/tests/test_model.py b/tests/test_model.py index 3c90cede3..cf29acc48 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -397,14 +397,14 @@ def test_fused_masked_softmax(self): # mimick the same via torch output = scale * dummy_input - output = output.masked_fill(dummy_attention_mask, -10000) + output = output.masked_fill(dummy_attention_mask, torch.finfo(args.params_dtype).min) output = F.softmax(output, dim=-1) # Test that the nonzeros are the same with the mask for i in range(args.num_attention_heads): torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0])) # Issue is we use -10000 in mimicking instead of `inf` - torch_assert_close(fused_output, output) + torch_assert_equal(fused_output, output) def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self): @@ -460,10 +460,9 @@ def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_i output[change_batch_id, first_segment_first_batch_id_end:], output_changed_target[change_batch_id, first_segment_first_batch_id_end:] ) - # Check that values did not change in other segments of batch_id > 0 - torch_assert_equal(output[:change_batch_id:], output_changed_target[:change_batch_id]) - if change_batch_id + 1 < len(output): - torch_assert_equal(output[change_batch_id + 1:], output_changed_target[change_batch_id + 1:]) + # Check that values did not change in other segments in other batches + non_change_ids = torch.arange(output.shape[0]) != change_batch_id + torch_assert_equal(output[non_change_ids], output_changed_target[non_change_ids]) ## --------------- CHANGE A TARGET TOKEN --------------------------- # change the last token in the first batch to a pad diff --git a/tests/test_training.py b/tests/test_training.py index 08c06aa7a..0e41fd967 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -94,14 +94,11 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - if os.path.isdir(folder): - shutil.copytree(src_folder, dest_folder) - else: - shutil.copy2(src_folder, dest_folder) + shutil.copytree(src_folder, dest_folder) return dest_path def get_variation_config(self, variation, output_dir, n_samples=None): - data_dir = self.copy_data_to_tempf(self.data_dir,"gpt2") + data_dir = self.copy_data_to_temp(self.data_dir,"gpt2") pp_size, tp_size, dp_size = get_3d_dimensions() num_gpus = pp_size * tp_size * dp_size @@ -539,7 +536,6 @@ def test_training_t0(self): --save {output_dir}/checkpoints --load {output_dir}/checkpoints --data-path {data_path} - --dataloader-type decoder_packed --split 90,10,0 
--tensorboard-dir {output_dir}/tensorboard --tensorboard-queue-size 5 From 985cd028010cc7e29cb833851249254d02ff8593 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:07:47 +0200 Subject: [PATCH 288/297] Revert back changes linked to shutil.copytree --- tests/test_dataloaders.py | 5 ++++- tests/test_training.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index f1f2cfa38..80ce8be4d 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -118,7 +118,10 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - shutil.copytree(src_folder, dest_folder) + if os.path.isdir(folder): + shutil.copytree(src_folder, dest_folder) + else: + shutil.copy2(src_folder, dest_folder) return dest_path def test_mlm_dataset(self): diff --git a/tests/test_training.py b/tests/test_training.py index 0e41fd967..c6a642ac8 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -94,7 +94,10 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - shutil.copytree(src_folder, dest_folder) + if os.path.isdir(folder): + shutil.copytree(src_folder, dest_folder) + else: + shutil.copy2(src_folder, dest_folder) return dest_path def get_variation_config(self, variation, output_dir, n_samples=None): From 41e931a97cfce39590729fa496497a1b7aadbe1e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:19:16 +0200 Subject: [PATCH 289/297] Get test working --- finetune_t0_non_causal_decoder.py | 2 +- pretrain_gpt.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 96899503d..0b649970f 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -98,7 +98,7 @@ def get_batch_pipe(data): if args.position_embedding_type not in [PositionEmbeddingType.alibi, PositionEmbeddingType.rotary]: raise NotImplementedError("absolute positional embeddings require us to reset position_ids accordingly.") - return (tokens, None, attention_mask), (labels, loss_mask) + return (tokens, position_ids, attention_mask), (labels, loss_mask) def train_valid_test_datasets_provider(train_val_test_num_samples): diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 0138f50e2..fdd4d28be 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -23,6 +23,7 @@ from megatron import get_tokenizer from megatron import mpu from megatron.data.gpt_dataset import build_train_valid_test_datasets, build_dataset_group +from megatron.enums import AttnMaskType from megatron.model import GPTModel, GPTModelPipe from megatron.training import pretrain from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices @@ -53,9 +54,13 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + # Hack @thomasw21 to get fused_softmax.forward_torch_softmax working + args.attn_mask = None + model = GPTModelPipe( num_tokentypes=0, - parallel_output=True + parallel_output=True, + attn_mask_type=AttnMaskType.causal ) # This is a hack to give us a reference to get_batch_pipe from within training.py # We need to call 
model.set_batch_fn after deepspeed.initialize From b321a3491384e7b744d974cd009b8bc94434b771 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:22:46 +0200 Subject: [PATCH 290/297] Woops --- tests/test_dataloaders.py | 2 +- tests/test_training.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_dataloaders.py b/tests/test_dataloaders.py index 80ce8be4d..f84313dd6 100644 --- a/tests/test_dataloaders.py +++ b/tests/test_dataloaders.py @@ -118,7 +118,7 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - if os.path.isdir(folder): + if os.path.isdir(src_folder): shutil.copytree(src_folder, dest_folder) else: shutil.copy2(src_folder, dest_folder) diff --git a/tests/test_training.py b/tests/test_training.py index c6a642ac8..4ceb2aca3 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -94,7 +94,7 @@ def copy_data_to_temp(self, root_dir, prefix): src_folder = os.path.join(src_dirname, folder) dest_folder = os.path.join(dest_dirname, folder) if src_folder.startswith(src_path): - if os.path.isdir(folder): + if os.path.isdir(src_folder): shutil.copytree(src_folder, dest_folder) else: shutil.copy2(src_folder, dest_folder) From 0450bad84ac234340209f7fa2e59c773b0d7903f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:29:45 +0200 Subject: [PATCH 291/297] Woops --- megatron/data/decoder_packed_mtf_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index 85e41970b..acc1ad901 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -329,7 +329,7 @@ def pack_samples(self, items): decoder_is_inputs = [[1, 1, 0, 1, 1, 0, 0]]: `1` depicts inputs, `0` depicts target. 
""" - decoder_tokens = np.full((self.seq_length,), self.pad_token, dtype=torch.int64) + decoder_tokens = np.full((self.seq_length,), self.pad_token, dtype=np.int64) decoder_segment_ids = np.zeros((self.seq_length,), dtype=np.int64) decoder_is_inputs = np.full((self.seq_length,), False, dtype=bool) From de4934f530b1c216e956fbaeab882d9af71fe0c9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 14:38:44 +0200 Subject: [PATCH 292/297] Fix empty samples --- megatron/data/decoder_packed_mtf_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index acc1ad901..e3ed8a2ec 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -476,7 +476,8 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam row_length = row_length + tok_len if row_length > seq_length: # current sample can't be added and requires to be added in the next one - full_samples.append(np.asarray([current_sample_start, current_sample_end])) + if current_sample_end > current_sample_start: + full_samples.append(np.asarray([current_sample_start, current_sample_end])) current_sample_start = current_sample_end row_length = tok_len From e3e21f553314939d10c32b3c70f6913f530c7340 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sun, 3 Jul 2022 15:03:06 +0200 Subject: [PATCH 293/297] Cuda kernel is not strictly equivalent --- tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index cf29acc48..d0fc81f23 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -404,7 +404,7 @@ def test_fused_masked_softmax(self): for i in range(args.num_attention_heads): torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0])) # Issue is we use -10000 in mimicking instead of `inf` - torch_assert_equal(fused_output, output) + torch_assert_close(fused_output, output) def test_non_causal_decoder_model_with_packed_input_passed_with_attention_mask_is_not_causal_across_segments(self): From 16c556c09613f7fb5ae712217ed47e163ee8e667 Mon Sep 17 00:00:00 2001 From: Thomas Wang <24695242+thomasw21@users.noreply.github.com> Date: Mon, 4 Jul 2022 09:35:33 +0200 Subject: [PATCH 294/297] Update tests/test_model.py Co-authored-by: Niklas Muennighoff --- tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_model.py b/tests/test_model.py index d0fc81f23..3410ab54b 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -403,7 +403,7 @@ def test_fused_masked_softmax(self): # Test that the nonzeros are the same with the mask for i in range(args.num_attention_heads): torch_assert_equal(torch.nonzero(fused_output[:, i]), torch.nonzero(~dummy_attention_mask[:, 0])) - # Issue is we use -10000 in mimicking instead of `inf` + # Cuda kernel produces slightly different results torch_assert_close(fused_output, output) From f2df7715c7a287ab0346b4b14a47bdf1f083c7ad Mon Sep 17 00:00:00 2001 From: Thomas Wang <24695242+thomasw21@users.noreply.github.com> Date: Mon, 4 Jul 2022 10:41:15 +0200 Subject: [PATCH 295/297] MTF optimize dataloading (#298) - Remove unecessary code from MTFDataset - Create size API for MTF dataset - Use new size API to build packed index much faster --- megatron/data/decoder_packed_mtf_dataset.py | 15 +- 
megatron/data/indexed_dataset.py | 3 + megatron/data/mtf_dataset.py | 356 +----------------- .../gpt2/ag_news_prompt_inputs_document.bin | Bin 12038 -> 23780 bytes .../gpt2/ag_news_prompt_inputs_document.idx | Bin 2042 -> 2042 bytes .../gpt2/ag_news_prompt_targets_document.bin | Bin 12526 -> 976 bytes .../gpt2/ag_news_prompt_targets_document.idx | Bin 2042 -> 2042 bytes .../data/gpt2/generate_ag_news_mtf_dataset.sh | 22 ++ tests/test_dataloaders.py | 31 +- 9 files changed, 46 insertions(+), 381 deletions(-) create mode 100644 tests/data/gpt2/generate_ag_news_mtf_dataset.sh diff --git a/megatron/data/decoder_packed_mtf_dataset.py b/megatron/data/decoder_packed_mtf_dataset.py index e3ed8a2ec..f504d7f91 100644 --- a/megatron/data/decoder_packed_mtf_dataset.py +++ b/megatron/data/decoder_packed_mtf_dataset.py @@ -288,7 +288,7 @@ def __init__( eos_token: int, seed, ): - self.mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, data_impl=data_impl, skip_warmup=skip_warmup, documents=documents, seed=seed) + self.mtf_dataset = MTFDataset(name=name, data_prefix=data_prefix, data_impl=data_impl, skip_warmup=skip_warmup, documents=documents) self.pad_token = pad_token self.seq_length = seq_length @@ -362,7 +362,7 @@ def pack_samples(self, items): item_num += 1 cur_len += total_len - assert cur_len < self.seq_length + assert cur_len <= self.seq_length return { "decoder_token_ids": decoder_tokens, @@ -465,13 +465,15 @@ def _build_sample_idx(mtf_dataset, document_ids, seq_length, row_offset, old_sam full_samples = [] current_sample_start = old_sample_start - assert epoch * len(document_ids) >= current_sample_start + epoch_offset = epoch * len(document_ids) + + assert epoch_offset >= current_sample_start for current_sample_end, document_id in enumerate(document_ids): - current_sample_end = epoch * len(document_ids) + current_sample_end - sample = mtf_dataset[document_id] + current_sample_end = epoch_offset + current_sample_end + sample_sizes = mtf_dataset.size(document_id) # TODO @thomasw21 figure out if we add tokens - tok_len = len(sample["input_tokens"]) + len(sample["target_tokens"]) + tok_len = sample_sizes["input_tokens"] + sample_sizes["target_tokens"] row_length = row_length + tok_len if row_length > seq_length: @@ -511,6 +513,7 @@ def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_w return get_indexed_dataset_(f"{data_prefix}_{field}_document", data_impl, skip_warmup) + def get_indexed_dataset_(path, data_impl, skip_warmup): """Build indexed dataset.""" print_rank_0(' > building dataset index ...') diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d92a0535b..d0d312544 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -573,6 +573,9 @@ def get(self, idx, offset=0, length=None): def sizes(self): return self._index.sizes + def size(self, index): + return self._index.sizes[index] + @property def doc_idx(self): return self._index.doc_idx diff --git a/megatron/data/mtf_dataset.py b/megatron/data/mtf_dataset.py index 99fc8c095..57f3a779b 100644 --- a/megatron/data/mtf_dataset.py +++ b/megatron/data/mtf_dataset.py @@ -15,242 +15,14 @@ """Multitask Finetune style dataset.""" -import os import time import numpy as np import torch -from megatron import mpu, print_rank_0 -from megatron.data.blendable_dataset import BlendableDataset -from megatron.data.dataset_utils import get_datasets_weights_and_num_samples -from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ 
+from megatron import print_rank_0 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - -def build_train_valid_test_datasets( - data_prefix, - data_impl, - splits_string, - train_valid_test_num_samples, - seed, - skip_warmup -): - """Build train, valid, and test datasets.""" - - # Single dataset. - if len(data_prefix) == 1: - all_train_datasets, all_valid_datasets, all_test_datasets = _build_train_valid_test_datasets( - data_prefix=data_prefix[0], - data_impl=data_impl, - splits_string=splits_string, - seed=seed, - skip_warmup=skip_warmup - ) - # Blending dataset. - else: - - output = get_datasets_weights_and_num_samples(data_prefix=data_prefix, train_valid_test_num_samples=train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. - train_datasets = [] - valid_datasets = [] - test_datasets = [] - for i in range(len(prefixes)): - train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( - data_prefix=prefixes[i], - data_impl=data_impl, - splits_string=splits_string, - seed=seed, - skip_warmup=skip_warmup - ) - if train_ds: - train_datasets.append(train_ds) - if valid_ds: - valid_datasets.append(valid_ds) - if test_ds: - test_datasets.append(test_ds) - - all_train_datasets = BlendableDataset(train_datasets, weights) \ - if train_datasets else None - all_valid_datasets = BlendableDataset(valid_datasets, weights) \ - if valid_datasets else None - all_test_datasets = BlendableDataset(test_datasets, weights) \ - if test_datasets else None - - return all_train_datasets, all_valid_datasets, all_test_datasets - - -def build_dataset_group( - dataset_group_name, - paths, - weights, - splits, - data_impl, - train_valid_test_num_samples, - seed, - skip_warmup, - train_valid_test -): - ''' - Build a single dataset group corresponding to Option 2 of data loading see arguments.py - a dataset group is passed in the following form - GIVEN_NAME WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT2 START:END PATH2 - or alternatively - GIVEN_NAME PATH1 # for a single dataset to be used fully - ''' - - assert train_valid_test in ["train","valid","test"] - - # Single dataset. - if len(paths) == 1: - dataset = _build_single_datasets( - data_prefix=paths[0], - range_string=splits[0], - data_impl=data_impl, - seed=seed, - skip_warmup=skip_warmup, - dataset_group_name=dataset_group_name, - train_valid_test=train_valid_test - ) - return dataset - # Blending dataset. - else: - - data_prefix = [] - # data_prefix is of the shape: - # ["WEIGHT1", "PATH1", "WEIGHT2", "PATH2", "WEIGHT3", "PATH3"] - for w,p in zip(weights, paths): - data_prefix += [w,p] - - output = get_datasets_weights_and_num_samples(data_prefix, - train_valid_test_num_samples) - prefixes, weights, datasets_train_valid_test_num_samples = output - - # Build individual datasets. 
- datasets = [] - for i in range(len(prefixes)): - ds = _build_single_datasets( - data_prefix=prefixes[i], - range_string=splits[i], - data_impl=data_impl, - seed=seed, - skip_warmup=skip_warmup, - dataset_group_name=dataset_group_name, - train_valid_test=train_valid_test - ) - - datasets.append(ds) - all_datasets = BlendableDataset(datasets, weights) - - return all_datasets - -def _build_single_datasets( - data_prefix, - range_string, - data_impl, - seed, - skip_warmup, - dataset_group_name, - train_valid_test -): - """Build a single dataset""" - - assert train_valid_test in ["train","valid","test"] - index = ["train","valid","test"].index(train_valid_test) - - # Target indexed dataset. - target_indexed_dataset = get_indexed_dataset( - data_prefix=data_prefix, - is_input=False, - data_impl=data_impl, - skip_warmup=skip_warmup - ) - - total_num_of_documents = target_indexed_dataset.sizes.shape[0] - # this corresponds to option2 for data loading on the form - # WEIGHT1 START:END PATH1, WEIGHT2 START:END PATH2, WEIGHT3 START:END PATH3 - # splits here is an array of size 2 [start_index, end_index] - splits = get_split_by_range_(range_string=range_string, size=total_num_of_documents) - - # Print stats about the splits. - print_rank_0(' > dataset split:') - - print_rank_0(' {}:'.format(dataset_group_name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[0], splits[1], - splits[1] - splits[0])) - - def build_dataset(name): - dataset = None - if splits[1] > splits[0]: - documents = np.arange(start=splits[0], stop=splits[1], - step=1, dtype=np.int32) - dataset = MTFDataset( - name=name, - data_prefix=data_prefix, - data_impl=data_impl, - skip_warmup=skip_warmup, - documents=documents, - seed=seed - ) - return dataset - - dataset = build_dataset(dataset_group_name) - - return dataset - - -def _build_train_valid_test_datasets( - data_prefix, - data_impl, - splits_string, - seed, - skip_warmup -): - """Build train, valid, and test datasets.""" - - # Target indexed dataset. - target_indexed_dataset = get_indexed_dataset(data_prefix, is_input=False, data_impl=data_impl, skip_warmup=skip_warmup) - - total_num_of_documents = target_indexed_dataset.sizes.shape[0] - # splits here is an array of size 4 [train_start_index, valid_start_index, test_start_index, test_end_index] - splits = get_train_valid_test_split_(splits_string, total_num_of_documents) - # Print stats about the splits. - print_rank_0(' > dataset split:') - - def print_split_stats(name, index): - print_rank_0(' {}:'.format(name)) - print_rank_0(' document indices in [{}, {}) total of {} ' - 'documents'.format(splits[index], splits[index + 1], - splits[index + 1] - splits[index])) - print_split_stats('train', 0) - print_split_stats('validation', 1) - print_split_stats('test', 2) - - def build_dataset(index, name): - dataset = None - if splits[index + 1] > splits[index]: - documents = np.arange(start=splits[index], stop=splits[index + 1], - step=1, dtype=np.int32) - dataset = MTFDataset( - name=name, - data_prefix=data_prefix, - data_impl=data_impl, - skip_warmup=skip_warmup, - documents=documents, - seed=seed - ) - return dataset - - train_dataset = build_dataset(0, 'train') - valid_dataset = build_dataset(1, 'valid') - test_dataset = build_dataset(2, 'test') - - return (train_dataset, valid_dataset, test_dataset) - - class MTFDataset(torch.utils.data.Dataset): def __init__( @@ -260,7 +32,6 @@ def __init__( data_impl, skip_warmup, documents, - seed, ): # Params to store. 
self.name = name @@ -275,131 +46,26 @@ def __init__( assert np.max(documents) < self.target_indexed_dataset.sizes.shape[0] assert self.input_indexed_dataset.sizes.shape[0] == self.target_indexed_dataset.sizes.shape[0] - # Build index mappings. - self.doc_idx, self.shuffle_idx = _build_index_mappings( - name=self.name, - data_prefix=data_prefix, - documents=documents, - seed=seed - ) - def __len__(self): - # -1 is due to data structure used to retieve the index: - # sample i --> [sample_idx[i], sample_idx[i+1]) - # return self.doc_idx.shape[0] - 1 - return len(self.doc_idx) + return len(self.input_indexed_dataset) def __getitem__(self, idx): - # Get the shuffled index. - idx = self.shuffle_idx[idx] - input_tokens = self.input_indexed_dataset.get(self.doc_idx[idx]) - target_tokens = self.target_indexed_dataset.get(self.doc_idx[idx]) + input_tokens = self.input_indexed_dataset.get(idx) + target_tokens = self.target_indexed_dataset.get(idx) assert len(input_tokens) > 0 assert len(target_tokens) > 0 return { - 'input_tokens': np.array(input_tokens, dtype=np.int64), - 'target_tokens': np.array(target_tokens, dtype=np.int64), + 'input_tokens': input_tokens, + 'target_tokens': target_tokens, } - -def _build_index_mappings( - name, - data_prefix, - documents, - seed, -): - """Build doc-idx, sample-idx, and shuffle-idx. - doc-idx: is an array (ordered) of documents to be used in training. - shuffle-idx: maps an index into a random index into sample-idx. - """ - # rng state - np_rng = np.random.RandomState(seed=seed) - - # Filename of the index mappings. - _filename = data_prefix - _filename += '_{}_indexmap'.format(name) - _filename += '_{}docs'.format(len(documents)) - _filename += '_{}s'.format(seed) - doc_idx_filename = _filename + '_mtf_doc_idx.npy' - shuffle_idx_filename = _filename + '_mtf_shuffle_idx.npy' - - # Build the indexed mapping if not exist. - if torch.distributed.get_rank() == 0: - if (not os.path.isfile(doc_idx_filename)) or \ - (not os.path.isfile(shuffle_idx_filename)): - - print_rank_0(' > WARNING: could not find index map files, building ' - 'the indices on rank 0 ...') - - # doc-idx. - start_time = time.time() - doc_idx = _build_doc_idx(documents, np_rng) - np.save(doc_idx_filename, doc_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save doc-idx mapping ' - '(seconds): {:4f}'.format(time.time() - start_time)) - shuffle_idx = _build_shuffle_idx(doc_idx.shape[0], doc_idx.shape[0], np_rng) - np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True) - print_rank_0(' > elasped time to build and save shuffle-idx mapping' - ' (seconds): {:4f}'.format(time.time() - start_time)) - - # This should be a barrier but nccl barrier assumes - # device_index=rank which is not the case for model - # parallel case - counts = torch.cuda.LongTensor([1]) - torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group()) - torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group()) - assert counts[0].item() == ( - torch.distributed.get_world_size() // - torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group())) - - # Load mappings. 
-        start_time = time.time()
-        print_rank_0(' > loading doc-idx mapping from {}'.format(
-            doc_idx_filename))
-        doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
-        print_rank_0(' > loading shuffle-idx mapping from {}'.format(
-            shuffle_idx_filename))
-        shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
-        print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
-            time.time() - start_time))
-
-    return doc_idx, shuffle_idx
-
-
-def _build_doc_idx(documents, np_rng):
-    """Build an array with length = number-of-epochs * number-of-dcuments.
-    Each index is mapped to a corresponding document."""
-    num_epochs = 1
-    doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
-    doc_idx[:] = documents
-    doc_idx = doc_idx.reshape(-1)
-    doc_idx = doc_idx.astype(np.int32)
-    np_rng.shuffle(doc_idx)
-    return doc_idx
-
-
-def _build_shuffle_idx(num_samples, total_size, np_rng):
-    """Build the range [0, size) and shuffle."""
-    print(' > building shuffle index with split [0, {}) and [{}, {}) '
-          '...'.format(num_samples, num_samples, total_size), flush=True)
-
-    dtype_ = np.uint32
-    if total_size >= (np.iinfo(np.uint32).max - 1):
-        dtype_ = np.int64
-
-    shuffle_idx_first = np.arange(start=0, stop=num_samples,
-                                  step=1, dtype=dtype_)
-    np_rng.shuffle(shuffle_idx_first)
-    if num_samples == total_size:
-        return shuffle_idx_first
-
-    shuffle_idx_last = np.arange(start=num_samples, stop=total_size,
-                                 step=1, dtype=dtype_)
-    np_rng.shuffle(shuffle_idx_last)
-
-    return np.concatenate((shuffle_idx_first, shuffle_idx_last))
+    def size(self, index):
+        return {
+            'input_tokens': self.input_indexed_dataset.size(index),
+            'target_tokens': self.target_indexed_dataset.size(index),
+        }
 
 
 def get_indexed_dataset(data_prefix: str, is_input: bool, data_impl: str, skip_warmup: bool):
     if is_input:
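The `size` API added above is the heart of this patch: `_build_sample_idx` only needs per-document token counts to pack documents into fixed-length rows, so reading lengths out of the `.idx` index replaces decoding every document's tokens just to measure them. Below is a minimal, self-contained sketch of that idea; the toy class and helper are illustrative stand-ins written for this note, not code from the repository:

# ---- illustrative sketch, not part of the patch ----
import numpy as np

class ToyMTFDataset:
    """Toy stand-in for MTFDataset: token arrays plus a precomputed length index."""

    def __init__(self, inputs, targets):
        self.inputs = [np.asarray(x, dtype=np.int64) for x in inputs]
        self.targets = [np.asarray(x, dtype=np.int64) for x in targets]
        # Megatron keeps these lengths in the .idx sidecar files; we precompute them here.
        self._input_sizes = [len(x) for x in self.inputs]
        self._target_sizes = [len(x) for x in self.targets]

    def __getitem__(self, idx):
        # Loads the token data itself -- expensive when the .bin file is mmap-ed on disk.
        return {'input_tokens': self.inputs[idx], 'target_tokens': self.targets[idx]}

    def size(self, idx):
        # Reads only the length index -- no token data is touched.
        return {'input_tokens': self._input_sizes[idx], 'target_tokens': self._target_sizes[idx]}

def greedy_pack(dataset, document_ids, seq_length):
    """Greedy packing in the spirit of _build_sample_idx: close a row once it would overflow."""
    rows, start, row_length = [], 0, 0
    for end, doc_id in enumerate(document_ids):
        sizes = dataset.size(doc_id)  # O(1) per document instead of a full sample load
        tok_len = sizes['input_tokens'] + sizes['target_tokens']
        row_length += tok_len
        if row_length > seq_length:
            if end > start:  # skip rows that would be empty (cf. "Fix empty samples" above)
                rows.append((start, end))
            start, row_length = end, tok_len
    return rows

ds = ToyMTFDataset(inputs=[[1, 2], [3, 4, 5], [6]], targets=[[7], [8, 9], [0]])
print(greedy_pack(ds, [0, 1, 2], seq_length=5))  # -> [(0, 1), (1, 2)]
# ---- end sketch ----

On a memory-mapped dataset the `__getitem__` path has to touch the token data itself, so switching the packing loop to `size()` turns a pass over the full `.bin` files into a walk over the much smaller length index, which is where the "build packed index much faster" claim in the commit message comes from.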
z*fSf=vZI@_)w5+7!x)H+wj zMo6<~`P4x-WgjzHNBWqL&zzgw-o9K!ZjQaSW84Giy>Pja=MMl&zEXCnTu`=g96FS@ z$M}sO9z_0Hyi?wyF~za__SleOrE;D$eB!f<6G_=YzD~pvU_6R16%V^&XZ>;oQM|83 z?luvhm=8L>t?9Tgf8c-eZZY|KJU`CgvO`-V;zBL(5w3~u_#GUa{KN&1d#)7cyP_ec zZ_Bklen5#JT65P4@>Srx&Ae0ii#TUHeZcvek)8iB*6gZ1q0M3bVsePi&>z>| zNxs(4;9B};v&#yD^H;xAREMElLUNX@UM8;QJsMgW;`@AJ z_2!y^%`V`+QEO-8Bc7b@iVpDoZLaj(NI9)+t|RhTUxvBWRo4ZP-wT1AsN-`5XXyIMf_=!h-xY2e4RLYMi-FYB*q3;I)0rH2!`t}q&B^g(G`^!! z0gpwBor#g;{lLlxeZ{fg;jG`BTtVNjVDERJr&?<&#$KP_@9rd4fYV`Ya}W5f#GWpO zmLkO8ZNU3@RwDATnM=CNl>=@>XT~%4t@g>`$nYp{8TO>>qxu8A8Ur63zASDgnY5mf zu;&o4{L0z|p8H6DK7coIjr(u-EWRrD(tPy~G`|9Nt6DizZR%2(-`ZOKM?ar|6- z-U(g3@v(2CGm97}TxBPBK?9F%0QktB5w8tAo?a(4o?!Otn$UKW45-!;UoN{T9foRz z^JgT7^Bi)VAK^>$Rm3~KTObSN+fO}3o)28r`9lW^E$Stye*{zc_1_jp77LI*?1}Zdo6NA<@cg7BcB!f_PWs3U^qVg3veTt z-r3rDO9-**Ekhd(uNS@NJrntB`Hk2wYyFp2kSpXzpI-~3zKiM^*?ZbZm$iILu?rHB z+~RdMS$QwE1FX}<(rZH1(8=)d*wf5GXH@g3mX~gpyN#InhM9Cc z{vAKpI&sah1-xD_Ol`s(<^CFb_e%H2gH{Aq61d6WfN4g3$V(1*9IUaOnr9+8SWV>D zh@3xLXtzA1CHIG=Z(=flPXC(+AwZX~uUKRaX>wsWP|t2vQVQE=V~ zf9Ia^T2=X3+^17tBMWer6iBxR)wzAwf)BVihQMU-np!omvA0lPl_AGv&DN#i1$+}v zbYFTO^W}%r>Ccs?N`^&_fqNpb>?oV~`xP)^{z|qQQw<-cXGQKx*ALOJ8X*}RWwXki z-XTv=excmpeQ4D^@$I!6y~igzmCg(HYIv$#Xk#mU5o2SYb7@_4K9l12I;k-lSJ zsh){^q~Kg$ANimQ@tVot+WC6NNBK-6*3sPCpYUG&HJKB33w~1h@nHC;9=C8;?yWrM zY5I*L!F_iXv%*i&pZW&df6L?liF{6%>hd=IyeCl>A0z&E%WU>ocF*C*m3Ljb|I_5` zsMq4a3bN*2V#kU;^tymAInSF%{)7&QzvAbbQ9<S!?W?AS->2{ zj_Dr~|Ig@Y8RFg_dy>Nkky(Bb39O$^I6dkSR;!C&nVztP@FZnax}E&v#V{IIf3w>} zrMn#8F5q2~I8g+=AH+tcLZ846ME2L<2ZgI(sy8D#W1Wa|tz=}}1z-Cq15=2>g4gS6 z`uXUx>cRIm(uKSYbxRNnpJ8t)hPuJ85me}jADk;oo#sCMf2BKg?{_mAyg3T$yZYPxR3HlDFPYDf79FZC2}{5g!~bnnVN z*8PAnqEWu11@gI?@B7y$XM=H$c@?m4$L19Wl^Yj>&vi???kOiOdOdc|vv)vZ|97Skx0(&YsB#x=iUwQ_95#eVq5vwI1mvr{?f#+DZqHp>i+ z(*JGXB{&tX^ca;V;lF3V6YVc2Bez1pPJEKEN^QbOx65r#*liVq=-&SN-zfx;M?& z$uZjJJTk+d*GJcXJLGY_=1|XhF2QeuVpM0UW?jDB>|am2{S_v6s0iP-T!G$qaeEOA z(O)e)w)TwoYfB@G7W6q~*LpUc@9taKJ3{vmm3@qV&xNlLU&Rmg4Mj&>yz-SdMi0m z5AXYH%r)1B)(8LEwq{`Jw@!pFo4{=ue39KB>_xo@oyzf*`|0_+dBZ5ih{mo(=?CX` zT~$8*7PcV1ya7Vf3cC(z&S?15ue-w(9xE~^<`(&KQw`YpLGiYq_I<}VE&n;Y@g+08n^l+H3K3GL`t~R=-d5;pqVXbW}?|5#3 zpC{2R?d?dXCLi?pqw9A7e8qkEeHevH{nKyfY(h=JnsFaZxJCO*Eu4Rn$qii+QI2>l zYhYIaOLPflK4R$7o!CA-%$<48*wMRTq~F!<_ur3k|Iv=n$hYKos&ru2-RwBXm+Fkm zHtr8v6-F@ImU8=yY3l|b|G?R2B!kBvMqZ`qD}e8*mqNpBM(fR>)?_(Tk9*xp=tgf8 zn=1BpI}{!9IRQOuRU;Ag-lgYy_RtpRRg6#fQ@)?yaz4+ChE_5a8o*zAaxeL>G$3;1s_TCgTrPs1W^=g%0Yd=71WWV^gp`~Io$upg#&R4-`b`Cd0kN9K2Wc z>@bAGwO7$cES_#JyN38P^`_%`Ic1vbjB2Wg7VxY04{F(F^et1}H;KM@pFJ6#^(yKx zlYhY0R^jum7K)|MP5MA^0Qxx#a4=N`LKjp5*&F4`kEX zk=f_SbQ#yOs|w@Mt(Nc{8Cbo|t7eh-_? zLBCZveR?dg@8%H4N)X#`q3+_I#>-F0e@|H8>xw7i@#&hUJ^Hu@sC@wWR`vF!liJ^v z4XB=1Y*fErzg1%gXDYYuW-~}!Z7ltw!qfeP2E!*EZH@*C2N7tc)`B? 
z5dQ1T9LYjGKgpsBb0iOq*W9M7oWJ@l-sWRm_nW=fEgO0NLwpB3)>!!l?d2$Em=btz zc2J2h@~7qYu*cEN`(Vk?rn4-=FCoqXS9=xHNfzgBB!(6bEjRWnoYeys4vN{r`Hzns zhgKcP74T`dEcZT+dfhr3qFS!+8Ck}9bl}0jp1UB7d}SZ#D!V$gcu%ZcP4c^i;IYzq zqWs{)j>+}I+P5R;Xd2k3Fop(_t2Y*L=knv;i0U*@OSCT#04c!G}YWzF}W$5xsDc=m*4LmwU{4}Itd@`0@Q{RZAYmdup1 zD<9K7VkdZ{=Q{hM_^19?Dq}K=wascEe_(ux&Pz`uFU^;}EH6x64gZzv|IGJ2+h-PJ6GnNV)*g`8{YkU>e(p(jnW=mb zo$(XxU%l)#;*K#1OECAXCdh{fOCcYVtgDZtmPc=AUlZCae$U+GGSRud#AILVhnnaY zI`P^TKfA2DTzaOpb;kFNXERhu!%iks@;JcL5-b!xuF19V7so16S_}ceA-rUb=WkmZAz>x13?y|Rwk==lw z1N)=0%UHhb3usM+$BG9UclR##*7z-3m(4H0KV;+n*=OsN!`8=lHhngT{AxycNEeh7 zY@6XdlaGeFF1)gaI^;L^0W%x8zD{dOJlv388*Fb%uE=`%Yj_VrdLduX03E)0Fn*&8 z{wq@=dQU;mzD75_+#bA6wuPEXz%Id^ZSM0aeeN$Bd|-0 z``n?_8!usxaSvPY71zXX?N#+g-*hcHr3;$>VvfAFpMP}?!~_YUsw zUM@#m0jIv41M7q#7dk&-(|A4)8Oa|BU+Mo+e9bo2Rt_Yb^KpO5J>&@u|61Sa9P>>4 z-MS6&Zu(t}B)dXIM9*7q+htY#acJdd0 ztRg3&uch}Tgx~eIuq}jL-G?tc*2C{9R%u>meBDRr*Lr@-50}Jterkd~&3C+Y?q$P3 zr&aiFy4UBYO+hCL`nC4Bw9bmhJbyg^9Ja{+~o{(}o_4;lk-pY2i zKM>f^X>JPvdM%#T8k=rczqNCCI?Ge6USL(ARp*R`H$o54iM-fiF?f>&9njp|m8f%T za`vCTSx@LcSb=kW=-RTXo(ud8j^84ugW!4BGv2$CkD5cQ$=8&#Z(wq0x)bANnV`!UGiOg&$k zWMyqh^kh7Jj9KJ@j2FL!%dA@iYY2T4E=ff4)EtJ}-oyU<6JkT`n__QSIaYh` z->J{j1Yoh>I(vPcL2iP4A7f18Y}DJ}r`%M$5e`qHH|wcXW4zZWxqMx5tV4O$Z;qX>1^!n-#ILJ>slIUKv7wE< zk~3f|lDPzX#|`KgevsZ*?4O@pzZ3d_{alGo$!BgwX2Mamh~%c4So?@E|2}{Q@klgj zJ@HfiQ#71f7e=~rCrvfk`EdAEp$NSknx8tSr<^IqM%=A6~v= zD)Ny(nT1{U!T#EyM;ZBsE?>Fb|57EieeZ;JK5%5;$`?h)5NxD4>-B|Kv7D8|$XC?8 zly^$-DXMkTvSSyPh~kOxQyfs7t3VzQ#}~b4q-UyqGY@C~3iu_k7xC$r1r8(S3D#kK z$|`GrU1*h;vge6y=4gq{p|6^sg=VL8cnNW+7vJ5PAH&ddJ?qiK?``Avr`(iWX0v_@ z3K-KdocoenF5qg;=dANFxD8~m>Z2WxCufJ^ar7P|u*92kueyCpzQRlSPb%xj=OdQm zgT&vai~YG|b~dkPk=qK#O284{WtYm=V!jLRXZRA)r1w0eSHJaky5jSSFKj0F!9NN1 z`*odf|AyD{RpOucB%X`zn74wra*Wd+^W@DzJ|?aiYd=KIf$nOo&cDZWtg7hv2`}L~ z3j1inbu16z{-?e$HzU0~hCWHBC!i!pFA9v5#};*9ZwkJNr^-cJ6><2gf3%OO@f&!i z^T?{}mY@TwoAMyjw@ac&Tvu!6`OtUM;dvHczm)$=*)!JY4%#%I(CPsYUTe1=C~81Mp?QDPlwW4@1iZQzdu@!46=KgT&tLDQ!iz1pfyWs;5`Lc{7wN9#QK>Y(6DI0RZq+=Mdj|MG{nlC7WA}KDs{Li11HAoSuL%wvcYhy`myO5{ z)Q{NP8eZ^Q_>>*uel?~=cB08H)u;YOR~kJMST+247Vb&cGt$?> zjOmYZEa1${ZSd^!b}rA3$V54y&UG9Hmd5;$JNR>7UcNIN=6gZ+)e7u)4yW9VKd*Id z`rf^Ibt}PFb|u)VgHO&2tuFRA0p5z=b$$x$1AIY^^6Bfw>uJ1j*i^#tNQRyVZX%y6 z;JxzU7yI7$#nd|2&BShyYy4~*bSn?g8LoH0qt^xG(RZ;PxbAs^GadOHKkZfj^A-Hg z85rs%p6V9-85<_IaX<6j_2ksmT()Pg)8ePljvWf@%PN7rUd3g1s(&Jaw-=ts-nCbA zE%sEFfFRlK80Oz^j7raaw9a$*6FrxG7BTE4+ZyR(l_a} z&Ufx?%6Xeb344V;s_0Ps);`+b*H{7Hy7Z@3gw6rLy<;f+0uIYWFp`;5G>`SN%l)+js|@4BJDs1{krb^eQZYdI0#xV971oC9OOhQ_01md{T=~< zW}J)F`GBp}{8?mA+}9ac`EBi~OMlZwIt~qn<72-dw&Y3gUks%VmfZ?@gqEJP}c1@hM_az zLwo;~KJh`g^+R@gH|TT$dhXb!_H$ZX?=_Zs5Sj2r>O-`8#`&S%`#yLj|EBeZBMbFL zlA#mwtj8kc!}}Y`?|ub-+I=p!g}`ctUs7&0={EKWTC%SL9?CDZPMxNK^#@OlS6(!* zo9jq{9_T=))YKFQt;^n!8(PjXhSk_^o{Gb}x-dla3S2fsey49hvr3hSJ& zdi$~8c%)1aop+4+d|PkV&9mugH;%r|Nb>Jio*TvIzKZ`!oNf@D6ZeVWQbP7Q-}!d` z*S=SrT95Pf*!?f%h~uw^=bZcB(2Yp!<)MMiWJ9f8Q+`~-zHh1Y@AzC~{hK@&kPKB* zZGx|d$m8UvEEBDqps*5fNEp1d_Z`5-7^sYvLFP8nDx604^jmU{i;X`cG`oE6& zeE`vRaap&C{lvzy@HX}f2XZB%Gg0z8(h0rOlFImu{10kpOIcI8q4$yAtn2je!|rvj z1#p&E2y94CKN}O*PlSGJ!I+2m);rj79;bKT6+2W{YOHczo#~R#mA@=LkKC6%$ic7w zwbqiID{pOvu4ul_t_UyDTm?9~H;{We&#|r@_Bz;~t>l4d*8F(iMAt>}C!$|+yaL@R zMlUkoZ9XR>o@^XWzJvY*z|p$6(CD0JTYN={Y5sfn9`ufWx8yViL;V>Xrok)4%J^O4 zRTI6x^!W#@3$GSJe|=!RO#G65t42^?i&WUM^83qHf$bsI3I7k~yI$-167)p0teTGQ z?sL7?p2ehP^q`n8`gJ}>XLK84!!ho|2ZdHTFYkIWH%9}H6|yt+JVck)C7f9TA6|Zu z#|^Fb=tJPL-{rSyf$R0xwcMAbc7PYiR`{30Hwu<~+zliL!b9@Ph`+B^@Hywh*h26? 
zwNxGQLA|FqkF{6hH^$P(I~OlyPs-61)0JoFtfT1IOuqT$d{s|MEyXFJYKV zgph=av3&H> zyZW?9XsLwWsIQl2P0<7G0O47o#3KH;;$h2~H#3renG!TjBXE z-?X11nJXWV9_SsT^)C~57W;aew*}F?BUQ1<%ATu#IK%&sX~8f*@7Qt;`itC0&2(5x z&_VSK6!QdA@2pA2OJQ;=?#X{@{YP$3jwdx4PfBW~&!fJ`71%0x66@L11oN~hA$+{e2DRv%!TjPQQ!;yqGiYHzUQjkDkpj< zzb^U9=2XXM%*-#?CnCYm&9&~UN2vG6R4>Ff*ebX4=)KbP?YjHCtJ={&8%K|wwe&v+ zvsStM)JJQAvPxewvdiQ6NWF_XhB+tEb;U2~P1)O=uAOzj;{kdaJo~$kQ_O7xp`t-~ zWb0daM-^Wteuxh7VmoJi^`Bevnbre1tfjt)&V;W|k^8Agce> zQ+NO!%rzqL^QNq+|1-*h?o>+idR_T-?NVNI6yc1De2wy8^&sL}Pk70{D0i-to&SdE zl^l)hd+<)dYVvCMt9onQ@pH#7(G|Z>ubg{RbLaKqneJP1@>_h}RfsqP4GDOC5*|r5 zdXJ^+Cf~Erf6Sp`ru~1}xb}{v3#;&5$;f0i>-^f4GhP>Q9$|WDKTJ*7O;3e3AwRt* zf`wqLAc3Pb6J28lQ4FbAlXvs#xKE$e!LPON^I1)g&#%>IURIBZ+8fTuPTr#r^p9qN zbtew|3f;H|oAGTB;nJJ==ZafpfAkn49Hs9Cn?m<2;wp*aNe@?=HL38F$&FuH$)vuiWh6mM)`h z@M38`pZjZ0=Y!<4uN(aUd{ON0X6#E&PLHtPDx5m*3hlek1N#A6{0(6QW1+K|^QS2M z9)dn#CQ*#iwPbP$d4OtrojobggqRG&WJB_&nO>r=MO@JPHMJ@HXQa68UquolTN7_H zl3f+-O?s(!(sZ6ndz<>tE&1d4JhbHZ)ePbjdMo^1F7DUD@9(y!+D)+3U-&z`)5X$Z zO3x*OANTp1>QPHB4;J@*GR0T%Iobc->Qck^>BQ%|<|1cc)rFo!%`mEwuf!*|gC^1P zcRzLZ88po=_8$Hb{HNfL+LXfQB7<4vQ42SCy}f>$$8f#tqx~bDQPVq-RjHe$Cl_?` z+)wh5zZHLLp{IJ*;&a!}(?@+z{%rpyAFi5UHF=PDQ*svaJMC*7K<|?EW5X|`zlfY# z-WyujfgAyv#P94g!sx&0&cy?rS&ZY%d~ncwopshkU8_e~2VPu1oBhKr&`JE)eVy63 zZmrW6+)OW~y#N2E&hY13=CtZP8_D+Fb@XF4I&A5d&P6D%5s$P-F8)Zb)$fh~Exq1y?@|{7Tko-+iyw0sr}Mur zwDI$#CHA=-^JAxvEl=uocxGrKuC;BebU8Vbr?cWC^1+ zmFsEGCa!Pv4zccS9-mHot(~EbD<4=U>`k^TKQ5l{NlnhSJC5WXL*@)9z}bW4&IkSP zpmK^S*r@RQPqdeui_PPcufPvKL7bL9k#0&abEXj=u-B~h@MA??KXg`YYHi2wo+2o76A{OSCvJEAAE9QJ9#%&0&j+r0bPMQL-`~gS zQogVM-uZlz|DRO8VV+;le$`{qP;!v5i}CxwpmiI`PXB8pUalCBi2PE{Jgf`c+*7C- z$oG<=8MLBtgRUmlFTht7^_Z(Zi)h_}?F?TK{?R@dXZFb@MMBU7kDNVE=uH`+Vgz`|+RZOX{qp z@_FfOe8zlGZ+aS2@qHn+`3}xSJ%g`IK|ZW0yO#b}XZ#NO#+nBh(>s81%vFt%43CP^ z!_xWMCtgE-TRbm|&%EhBf_3KSG_;N3)1QSD%d)4OyZ%#M-T_3$@~6o9y>- zcq@G0CfCq;hO=RI0Iv8qsTO+#O`O1cTToG)^m5gMs+DDRXMCU)y z_1I2&H1j%4&n7ZYG5fw79EN%*Su1)B6&}gROy`VcZ{p9D_}x_2i_ad;e4IUg_*oEt zqqDagscT-WdTv_uU!pxLy3Q~qf_Hkq>tsh{B!47YzD0iOvHrN!--~$ogyUI!m9L#q zY{`!v$2Rs)oU`nD5ucVHChS{cM~>?g(SMl1{&VtNeQXhYwr&friw?j;5MOXTb fE!oLNem~3@nwELMSf80DXz))i{}3W<{d~q literal 12038 zcmd6Nd3=m#*Z*~{dzs~)d-j=RlF3Xm*^x~WqLWMrNsT0yh}aSlNr;pXJ3&xEP*klg znxd9c<)N*mDAigXEhV}T+P<}vwogl|8|6J$^7hsD_e_4D&->r|htJ5&eP8EX=Q_*x ze9t8-{rvCoK_NR~ySPH_ARmy~{gqTE8ELB~g`1;#ir`k4Ls5c~d>3q{10yx0zkL)) zWU1+loCh!Oc9V3S19o=OkbPV(8Ocec4gw9|>+a(ky-&#hw)(+{_7DzN#_`2R;KYE- z+n%ibvpd*ySGQzwIK-PI;wGz%jR4>d@j|6ImeY_qoz38a2T^YFl(yH~SItfkVUJcP zNu&{8PS6|OWEK}OE*)Our&9<1Hc8=IgQ}e_Nb9~lQb~W)h-BHsCmPHY4v7g*?KkYR zEbi_mTpJM%YhAcfL$(*xadj|FvV)u4nq5srIHaGX2oMBeA}_+C04C2(K2M)dFF-Fi zu-mufn^SddmvI-`y(etYZhJYPV_uE_5i$J3}v8+~oIYwM`;6_)$71nrJ#?>U>MB+PUA5cEk=>K|`KZZ5Abx zT3k<0JZA^3a9CRo(cP4!(PgH|@E-II{+jzy+Ha6Z!JuA32ZIaV=Ozl<#!F<40-&B= zmQFmE9Wt1IPEo~)`2Jqw*g)S>f9HSGtT5MUBr=)%u;&8*RWL_I2e9hc)HGuyBStjKWVgFN-Ni0$N^t{a#=DLsao$&35TYlxRR#K7;vYD1Pu(SJ6lLwpq z%f(3weR7C-iKYSmlB_2(v5Id>%-g2Rf?)seU;Ib~O8Z@RxV=O~c&^VGtKMv)iCW+G zu)kWXs^>n=tfu3l%VDZCU0AB!ZFqt3mOrAvEOJJVrf&9oV7%^+!eYs}l3%<2T-7``yhhd7sgcix~ z4q%^VdSAIO8Q{e*8LcT3CpA3D? 
zu`~BESR!Qf5{cA6kxn92Fud&m{u|RolvlYg2EG@zKm|Roh^FKD(WtMltn%+UtBEX3 z_+xa7$N9=zO$7A+o4ob^W{v;3{@*i46LDdF`1j1wMD9C%XO1Q^Soi>` zd#~;dPzf^eBR7R6@>Ay1w16*%E@a6>C2>H)QxdtZ%R@PzJ598SWa3$jwq?P7!x`#T zKJfasygOAxsx`GzxN}leo#usM8WKm|q5sNkBFErbl zhmPa99W%c!%@R9B?4Ix$+6drb)E4*@ZPp&+!jK-<)wdp@WM`r}D;#TEI$TJc9sbPR z|LGsKu+2qacH7h4nVcqaoE8W(4FH?Pzt(QjIAbw4sjYGo8EFjbnvULoSeplDgIuth z&QsfA2EMmc(-rq>RfsTAcRI+oE(zol?TdOF^ssh>4A&iQAWX-9x9IPaB1_w8KrFWy=qqDG^HPUaH})LlB># z_6Vc#L?fIJVUeR4EUoy^REgRChVWAlX8&5OYV7Yu@OZa0`^RQ+k<1Z-x!#X#2xGNr zojfthKWl+XBHb0tgQm+HExz}5e$X*$8%B$5m5oc^=BouP?P!adZr!EXVc z9r+e!%s!Sa5Tn zSA;YhU?p%ul%ESO`rk14UO`;*yDJ^O^RMl6M0H64aWiyvP`BO=k5QtTG~h~~speU( z={Z>LSlgKf(PGIl(UYB;9Z?8O*4j%<=7ymVBDdz8@*dw^QS9zR< zbnUFtkS@i?A|-MOec-Ncc7X_MRc75+P=i(A5n8HisPA>#k-m}^UZmgezN@=wtJDot zFk~eseZRX`dzQ_H>rO;h1v(&L-)rcdg7CCKt9`4R3oa>ap2}9RYn`GS%x(WL{=VN9 zaF!0pz2p$_lSNIf(+=|lFZ`)EP6axesuc`%^m1;2e_@6y*xNOTU*m+Ct`~EBYdfX$ z4#Qnthc|xltA$zOA(aS2O*;%~;|(P{vG_L2tsbz$9>r=}Fvbg;w1Zr}*O|Fn5AV%! zVz;q8E~a#Vv3N8)p}es7=G?M+VW0uK2hA5^i7P-+V@3_ZWwFEZxQ|gLvkA*^ZDR`&9WV?8EoLz48^Ks#7h2CCO`q^;UA@C=&_T~4lK`) z2v@(pzm>SkUPd%qZSP6UWAvEQ(xcK~oA8{(BI@mNd2lZ?YK#i}QPO?=U(b}oX9sc+ zS)<XMKw*C0W|Ine4a%IrzsYxxD;HU#HU{5 zwu|-n8Liwd`rchIF%7)kBM+ucTNh~>c(3nP?IUo4Lo<(e!9n|HLLA2Pul78s(61Kv zM@ZzQz+1{KP(3Jz)F@rx4LC?$T03ZF;?qzydkl76E{Ib9j5U1@R^x9=R*Tn^#cncJ50l7ntd7_)GiQqmtRb^>`7(e$?JR^VAK`E1x;~zX zsP*SjZNQ`w-@)JDS(h zCfe+fjB)R^u=RAU#P@XTJ?<3tZN1hw;4Sqrt%5s?9r^$@=Ajsgl=wG5c>g7~%3kHL zGSL6oj?l?=87r{6Fn}yo$)vxK%GKZr#AGrrY(x)~wC6f9e=kFu)-_|j0qT!oTn5|8 z+88fP9LTbpKtr#JQ}CQdd!*(_>gT|24F1WpomA?sV$B^-N9(d6A0}el=3qDcDSG0MZr|}! z^u{c48yI1kVl6&3HPD85KMj5nvdQn68sgGi#VGZFT)j-OOVJ70jVfUR^!{<2jlplxj;tAB8h?aYpfebG7^o8J*(h zN`C1bR?nQUo*!y27cGc0!%A4y^u7P2(4qDgy#mOKCO_@Z(^=l$$@L)rcXSL!%e>?n z63D8zpwEL3W8Z}|SncHb{cRp)k43IPgk6f0$RJMR9#apP zDZ`Ctuv76Mw{roCVTa+Eb~G)uzYR)54=tr-iUDsi-l9t(=NydQJo^BX zSx-;uf{hORRwDQOT(nDJ9yGLFOdljWVX$p5qJ0P<#a53#si91j?*|q=t<%~B<88dd2=iHETgEW zygTrpPn`?A!_D|b*-v8k?t1HaFLT;(TdSKs@nQ$*)p##KDL6(*|hD*;}! 
z4oAO=3dw`rnO?qAln1-h5+TLRy0#$fj>pnPu`c|-m(N0MmA(=4c$9{eguEJ|#ePrwTKznHj_8XSd-*Co&ch8Z$Zs@CWNbRW<*iaNvrtoDr(x*C1G|2sd-D#EuM;aivS=>)X)!;AQ@Ka9UpneQa6n zSj9-Qm%>yhtA5Nsn#i`0vB5)Bok>uL2y?YnI(tw9OjICZYQU#yFS(jqpz0brnwB9# z$)+D-H^$y-S@lmE(af@5hd*gP=&;%MrOmvkXH*WzkpdGXGSu?E$wp#h)8W$zYl}Nz zEoDCYSeiwdMn)y8wR{qb^DKA=J;Ym42aVC4c_)nLb}D|#lrSz?*LIs{nD#7O;-XS8 zegje|tLlsVFTxV6)B?U;HV?oWy;696>=Z>h{+1~oRI^?;3+91#IA~gG<45`T%wK3d z$m;+MLabs+tVu`K)7lF$QMKW*q#-L6XY%5y@11SV*=&6vvn;b#^+nJ(e1<+yTbSJw zy`YDc{UYNp4Z=D@6S+`egvh`M^uXf^GseLNtZ^djM`ThX#bV_>5^BZS5Sy9sb~C*S zk%+}utgSO-K?HRO#gYSJ!I))5>-)0tK<&%I=RGQPf50NK7P8g33L8mvC{jmYK1m9= zp&G^2V>Qj@YGHQ3ApR%S1}GArw}hFnALIsicEHPc68=M*V)2AxlO6M!KNd%1$i_-%xQYM(h(`GIg~H=-HXXjre<9oJt+- z(2!*?XV^v9Vs=4qE^@-noXLCx_6oD(*IATWkuYoYk5M27`n=?_m@Kor8Gya?j(LOE z_}=SS=b^SQCx1fMqkgmXwJs}!npZ^U5vJ`CSNBl+bjMb z>vSIUHmmMj{Oy|cZX06g`r-9T9p49f=XdSLH27!zbmuT}Ah*c>3U7y%$Y-Qs@5z4I zhaD)u?HWF5iXu&%n}KtT%q-0ReR)lqO>|MVhTnEPJN@JdF6{oULOQ;6o?E24>@$mf z(mvJU-Tbk0zr7ag=>~bh@l_VK<9mFnBR6|jQ6ns+b#bYf-LEv)i^41xIbQU~Vkn_C zspU(Zkdm>K?!-x3+`29=hDD4C#hSFh1!cl!bt-mODH+TbgJC!FDotcox=gYKBi2=% z2BCkSlw1%bQ2HCSQqoHGla+PN8qG3#Aq}~(G`oW^)?*{Brwxb}j z@bL0MAfXKw7;$Q#8AAIanre~Zf?|E<8zS<#(Oe}z3MwgPJpbqTA9xlcuPQ_l5(dz{rx2}-4HP3 z9-<)^EM7dDXMO1_+Ef~?OtYvX9f%JmmHh|WSE!KF;P;E$gt=kCbZ^d68j*kxQH1;fUcZcJ3Q^_EsKd(p{=;tsuemofGvv$hfRRaaH^nGy8uN z*$IMh(Y^~l6KtUJUx9UJHN6$fo*1l*cCv{00a4;VLanig3Y*`lPkUo~uIn0Na92M3meUC%r$I79yYOf&&3dV@iYpY8N;I3WQ4SUol>YIfU`vE$eAo zLCHxtFKI*A#6|J+*YlRc9{7>fTz-zgIum~qLdM(8-c{gtphT!`p0Kn$9j8jxoi>(3 z4DmQ|E)8Gz$5Tmad7s(P4rR(oik=3SnlZ_bh3z^oEY@7svC8+|PKnH+Z%XAji)S)$ zVng190e zX3tqvCwrKkb2qJsW7Y3=be0Cu8RE_wKRfbMmoZz^LY1u_)=$Q~vnMcqvsXttF;n@L z9%oK8?ABaEJac7-oo8<`*$)r{xWh)l0p+h4u_Ejh{P2XY?6^gH;o3}M?k1TW8u+Vi zseUf)5`Pw!aF@8cXgd^X59wKSeMo0vj`me)laJ4@F|IGe+HiVE8uVAY^4X1p+Jsj! 
z%k&LU-T~k%{a~dNen1R0U7n=4y8!Qm7;J>|=%l?-c_hfN1->|nI`H8Zx9zLyCiKSM zenB(yvp&I#0Z4L4=Gf<;iSS2LTU0R}1BFe~FKNh|bUCq2z^6=v+ zr}gqInH*7T)u%%D*uIVTz5h6}QJmYhyvJ#=sz(~uwGoD~A^B~ur~twjIiW&+T~d!^ zCmkTLxr0mq`JjfLv(?NSZX{MBIvp*&!29l`!pSOyBK)C0-@Oj#@hFUUH`&u0r;@e;#3?7yZoV}=nrH_DUn`&DMC@*0 zU`%mXk0)9(R@?zsrR8e8krO;rw`T{-K;@XgeIJWYr$eIFgw@YYhDEQ>Vl$=N@h5sL)G&;k#mx}i41o-S=jw#XqPEw zV3$;Y)}FQ)ZH*Q<&qd-anw{hY^pg~!0IOvcoW~utCo3RKy%B(kJw)K)YP+L1e&ld$985b1&e%BS#kzI^$%_1E(ZsPIW68FOvKa?XUaEP=<`x zgl&PLPi`L7!jgi4gLhPU!R2=*{t7a$7yLR{&LUr0Vuhglcmu;vxIZKSYy2$w04JKs z+38TG5A4u#kKuNFe|2=btF2UDZNWED{{DA_=i>gpXLgI=D}9o}P5#*XuBMvKP;1B| zqv)_hM{`uT2d!`}tRr@eRXUdUef!_tb}u?TqwGZs`}{Mk9Z5X0u#Zg6yv}A(dbID* z%=`{YvN1frgPHD^%yfcu3-Rqxi!V>Udw5URoqpoH6Kzw*KGE`WcNRAl`cjjdtacBG zIj4KW9gS-+bmzqLD46#S5IMgr9WHmz!`y#Xi}wM?vySr~cIP7Gm$4g&g?LJdu%2_1 zYGexl)~nw!F)wUwCc4;@2PAD~-0io4hLT5kN6KLxqg0Zq`Y}o~t*Bc>$0+9Ew7#C+ zHK*{$=-a(S@&%XAjm+|GrSCj!c6Xc~YTE3dp=G>wIb?TybFz}~`Zp(=Y0Ejv|PoY zIe4-9Epef^V0Z)c$C>+9?NXe^C>8>o)aJt%9Gi81s|WZQS(Zz2e|a9y6=UsV_ukI5 zb-{$B*1WCPNQo@(Hq3DYQOcr@3;dtx5UgZTRES-ghg*vCkSbxF@!Q^hp2s*VdRtt8 zH(>gz+r*17-_jHBx=1($v|7Kx=XN5OGMswwCMnAmyHw03`SU+hKjKbeA9NA#&H3`E zVddu*EXV9v6xN|MKJ$qdX~-+T*uCzB-`gP$tYtMOe{~PYvF7!es2vkF5eC?C#*Rp@ zDC}azLkOj;7HPGZ6SYh($(s^ZppnT?WH$C&eM>G(FK?KgRsp|hYM_^KOxWLLXUJyH z9bGVdDi`Bw8#G@ka}Dfn)lokcGQrakP0vM6p*x*{eSnBsXh=Ahfmu}!DnLsBe5qvj zJAKPNzk{EV2g}4tZWX+X(e3T@l#rA7M7pA@2=Du|xOj7%L=Fm0m}0%N_rnQx^noJn zNaMPNGPzXttO0LW#ectN5In#=Dwn0X8=zmr76`PNy7-oxh!yQ{=GwOV`Pe7*5+us!r%vV?1{ui{cvK zuyO|4;R@>c&ED7gRl)Y++t~9i#@Mc37)@^&TPlceU5>1M6s~}mrZ8qMs^{~Tueg_U z%=B3N6K$%eC0nQYc`HI`K5BES*9OX-`4`re#E1jvFXs_|ol(T2v`dz$=&OI-$+}@| zuVj_Wx7<*&sdH6F5`5bGjAhT??nqx!uf$t2F6@!n>~|kBNN$ox!kX*)2xWC zakezM>pIb~&I>=`ZQZ+b83Q{Pk<8o8d6xAV6Sy4ge%vHW7#h@i^(ag~)9sT(!MuK1 zV!bE-hS)7cB7DwiI2>CJ=kz^e81ht-y64%ancDdfoLXkcH;jETHn!sJn+#PJ-Z))^ zcjS^>pKgetClTGTnGL&PrfpYE+irBiSLZnnlki3uIN6;FeC z_zK)D-vO_|z3_YZtm;892j7Kn!N=w6U?+S+bqClEXW*k+JNaMD&!u6masM-_xeu>_ z`*M$Qcnt1=yWj%+80Izb+!x_2+zKCr55cEkzQ^M>k;gaTckm7PBg}JgpNH@hcm#d` zbDwAM5S)iE!&hPcA0*Gu-;V#(uZ71v7tenQuJ7H4<6|&?6W^oxKk+j>PrvH=_40SW zfa|L2psG7Jzcvc)W5~KT~X(D2Uo=M9<8?om&K{QQ788WE8>U!QFrm?LW|-|Th!AB zf-}7FP$lKCZr*I@`{Cdcg9gnW3Dy`$sFfjw{)!6>GgM|cp+1Ha`X+Yq#zIxG)ERY} zHxOD7Qw%gTFJ>7~sGA{&elifyuXDl0^T9tCf=PxO8emYMIR+4FW!emTV=qZrJi7w!GkIcJ2cP0LZi2XB{n1i;*W`jD|P*8d(xFXJ|DW+e>di_;xVQLFgTbSCy z)E1_;Ftvpxbpo}AsXa{XVQLRkdzjk88udN3g{dt}ZDDE$QyZAtz!ZN|{7tbp#oiQm zQ`~))aVy2#6my?%Tu<=@Y7h9F&5XAz~i`|M_|xce+-t$6!9XRTQK zoMx>!`^;voIGf_^^ONg}wJFvD8*wPNlwlC@&)vy*k6FOXvH&j!|t zxzBXgin%G~rkI;z?(>(QQ`}8)H^tp&I^S2^{n@};G56;NYsK6YbAKjqT`@Ps+@C32 hSIqtS!dmgRdc23Z2#o~nmH1j5701MJ@r^hk{sUgOZy*2w literal 2042 zcmZ9NIZRbi5QYa=+;>D&L`6ivry>ZrB0ToR1zE)vG!$wqh_SFV#1La)VXTa?rm&ze zR1}8V%2-ekVyG+(F%-ta`R{x$F;4RHotb+%@7#O-JEx|qA?;4$Y# z*E!c2)Z)D0s&_qbop7CZb-GkfyBb|(sMeWsmAXn?M_l_|M_t7()iU>Lta>3e{a=Nv z$aUP6`Q0ub54p5nt)tS_=DO;ta%m2++_l%G-^)Et^-KTea7U{ainosN1 zHO-lcn+)dQKvoRCz{X{fe}g^CBcFwnD>t0JF(6{{nkhPm01KZZYH`ZPdr%cUZA5>M!6NtX?1Wmv8~5Hbi|4&coJ?QGWxM;OM5P|Ay~3M}B`xFts%} z3G=s+!@lj2&%lBmk-vhAuq-F)kKh!{+Zpwza2i@})SKZr%-t3BMffZ)@{!%aKX7bM zA9-qD@GWdCh}}ZDu&-fP!=8p64f`2(Gi+wq%&?hZFT+-boecXJcCr8LVc5X1fwBF@_9s0} zkIS|j+ieBB1Gd-LR%1JpzNcH*#-t}IXWNpVq@3+Zdb4u2Bk9x1dF@GmQm!7QP>)vV zOi+{u>?BmbQ&a}*6s&^Pum+xn>YWPpM1}gHLg$V`=Ypa(`5R}JhIQoiFbx}EBW!}r zum!flHrNh3U?;o;)$0{k0?C=7VHf!|*bRGNFYJT;Q0I=~dcX$Z4VXCxZjs-Hci>$( H1c%{YQkiNZ diff --git a/tests/data/gpt2/ag_news_prompt_targets_document.bin b/tests/data/gpt2/ag_news_prompt_targets_document.bin index 60646247e5037a6b277473adb47a2864b90408dd..ac2ba952c82bd65ef4e38a63246a3219b11ab1c2 
100644 GIT binary patch literal 976 pcmX@|hLM4Zfq~)3C_NeubVvscJj@J2zyz06zyJ>KQ9gCU0{|EXerT!(DtpRbfwje@;g`ZdiDK1&*b_1{`>vmGct4E*E!d@&hkCq zbID3S|2uq8$adH&u24J32V{1CC6!4=+R91c=BS<`xYgxQl%OQv1)J%>NDb+49|aOw zYC0w7!OJ_{Bwgo#?VU7a50^_uauTV7K*RUCySPU0W3LbGeN6rr3Hu}3LpWR-#}^-h zV*@U3dA$BF?qJhx-IB%O5O0!*o2)c80)RWj3zgznPDAE&HiHZ9MY+in+Foy8HakIt zU0R(akw$nqL2q=ESzN@pba;)QP96B$B!zD+sV6a*O;p=A;i{tqF2)Ia zg1hOCQa9OFyIBixu0v7YDSF=GCVxb$Z4$A;PtqCDMAIQt=UZ~s_PvI*Lw2|f8uGMi zqbQNo;(B`Q89Qi&gW7V4?xrM-E;CJr_n>$1*W6FiUV}sm2K5p;7+ml^H&NI!ULr3j z0P5*Q>DV*bA%pp66jhvv@9#E_4fHMf5B_(}3Ui%CB9plfdoJ)_33J59duHcQXwVuL zsM6tCjhiS9`@@e4Pu6*H_rqx^?rpE(^5R^viTDZW>HzYV>?WN_PN+oI#!Te^YIP0S zn;=zN?{Sk*@;03<%cLA4?0-ozi6u&(-ng;CT=!AF6TTgB!>=4xOG>g%Hq+7uc9tJ% z@?fKXxj0FoPYy9J(KNs#>3Sj)tN5nGJaD=!2=@Q^#g9~=#6NTg+e<`*XZoD7>dhva zsP%0Z`>VC8dhX-QYC0~u9HvUsg{9iPMwu8mCD96x|1#5_6&3n|X5_UvxZ{89Q>m1QhePXmj#69$Dl}tWeZ|xO9|IR?hwb`Q`saonoQr;A zI;wkF?96=(mIxU=Mj|y(q?1S$3~xJt|Hd>C-=k8 zYa$C1{v6%nf&VVIH4)JNzvj08JFEO}8Gp|#O~i%y<6kpN6S?dB_nD=M4A%XpyizA^ ziT1**ckQreP_jltel2Wl(?n8|g1ZL&cdVJ|tanetIViNS zHJqYeM1CQBZFzJaDIqOS{v@gR zZ?th-fU=2n(VjKg!C$+vIX_Ee*1IPZO=JkL6)YRF$OqEG&Mg1!z}WZY2k*q>Zn&}d zHH{rUDeh-)BIDq@Y%O`F!(&gVh2#`39Fv;JY&ln*N6zS0rYzAv$Jh9c6;i1k{kw@6 zN4~_dJarV$@QC?!X_nY2V&{a<&{hEVqc*{(XuI|pD~9yAuD*2%CEF9#S>ad@)8Sm= z?C_`N{%?P&g)J@uv)i8R&SW)_qqIPnX#m(LK3cy?ZBc}B(I%D%R<-Z7!`J@V(F@^b4S7q@5aaM#ARS-e z9k4>@L5u#_T(NHpB78|(o{&-s#75X7PQcv0QPUXseYEej+v78hmJxmuIUmSSMZ31< zx(LAo4q=Ph3+@z_rCFs`ald2Ux&$E(Cmd5~$QM~GuWyD^q8+aDEUSL-REbbZcT){_ z6@vH-wMQ6@Cmi9t4~rbdU}?p#rb_HAt_eT)VCPYb)sX$&2p;dIW}n$8E|NJyur~OS zbz!WYt&t~2`DZOqNu;}idDV1zy~Xz)&kj05Z9@{_U6mK&gG6$!ozs7Iw^K9`O1S&F zHux>T(<9%)?Aph&31am6(D)#hUz>=exXkr!rT(ypXcKU0+r4BBd4;P5OVN8$VkV1# zJYZfgk;~fG5Pem(Tim^x+Mq#QjK1}1S0`NB_GT;f-jkBk?x6OApouz4`=aP_S1FRrUi1Ks6dH-t$->Zl({NYN6@BC{!9a3FTK->&n9n`J0!$Xv8CJngS zXR3LYD|$|rNE2!)f?4B$(Y8+mML6-A3zS0={d}*!KVq+8822W|o4=+OB1H$h6xg4y zr!F|k|HUmaEz#WbvK*f)E>pC_%EqwNN=qY{c;9Odd19n#bX~PDL-(0#q$v-WZr5$> z?*=@1v+lO_0}j2YS$`*f^GUEBrfZpnSVr)@?mIEbDfj(YezEh(DGr!#$Jjt*A;Jr~ z*QIEUlQJKRo#2I1s75<8FHTpl3zNtZwN7B~`CglLoNuo@kPaUOZ54l*v=noHQS2d< z;8h-_AzeGGG^9)Mkw}SLK;O8nn_VEnYL!{{71UrAd4N{y8tQxPcBHSQg%|1fJ8$c* z+bVSf6%4V-N#F16)t+U+;kskdRe=r&*z+2CuOK{W&}!f6=7I|fo2RlBd|apK26Nlr zkH71;37n+^axXYU{AAHp>$Jl>!3%#Wj#7b+rfLO4GQFIe;0MfV1-rWj@h>=Grt8HV z-&#-Uyv=Z1*Wry{{c2&Bct9n>P}4Sp+IUULPBFgCa;pdIuuHLu7L4)22JIl1@3m&` z)WdsooY>_okBccCU@RWZPAxC&y*W3mUKnV=jz#l@SmFv$)R<9Ia9QlIJnmzZ%k0DQ zq3^Y`mmLqc0}&ozp89&;ef#n7@q9jbV54k@D+U|5HAAt&84(@)oC#2XdHg478G7_2 zvFfX4`ub^Cms!!}O>$*djb5vB-OSTprxXj2fc?f0TAt|LB== z_-tPeB6;+uz@s$7a)ov@6U~NQPGI`^-pf>rB);YMbU>Cd-@DYlvii!yuC_j-OocMYX*70$z8*s2I8zr>da*9&Xl;*bP562bb8 zV(B4#7Q5W^6>RU2q&677D`g0Fb?1k$Txx`Dlqpcll-;dl5SEi|q|(m`N9c~I?ee?2 zBh;-L&|4%=U|ssfRY6&Ni_(0%1EZD6WO1IVWk@6soc_rv8gf3U1+~Z|N;pOL^S_~I zCE`;ra$CiE{ESv^6@BkAn3x9M?2!l4rmcxI4ZPELv-SZv!J(PQyI{ZlGa(LR|2KOc zROnZUdm|+BQs52cCa4~iLu!;R@CNLsF0CCjGx2Gtnmq=)J{Lr(e?hD;2P^!yC9A|M z%3?U|H-s|38PG}{5zBb=+=W7Lmhg*f-93|x8zJ3aF zKvM47b~3qUbi&jWiQI?HkRYFs0J5zIQZ^~)nQQO~7j&n~)h>FNb~3EP+$?do97@z< z3e5u#B7z|_>%ZkO93sL9Lmm{y(pGY5h=)mKxLHSRnANjI1=gWix_lWxpLQC;l@IW@ za$O(KMAZEAsJ3EYP$lgP_voIH;g(G966V;x5Z5gqPbzM-6Z-RaR4y16Fco{rzObJ@ z&$Yt}v6=4BOy<9)lQWC4YqINI5GxLVFS*}fRZ>Yc(;DEQ(N3;s=G%^MQ)R?bgBvRtv0O9?Y z*eZLK!}EduPj`e)wu@NJ-Gu>UsY)jOg;cHvPa-Chd1WJdq@+F5k;On6+P1D4PYzIj z4C6A`ZdS*5Vd6lRy(RPv(H>&lub+e+Mdy$#IH#A$#>gbniJV0a=5m^bL`!+(6upZz zDXN+85$)PiBtZDLmk=R&+u3XwIY)iuqV;v*|5m7RqF-M;~bVJljpB?lRWx@pQB<3-Vzi#&ZsK<)5Ng z{^a%@Q$=sg61RX6mMK=_lT!n2*kh!@uR=EYBU3|Mn#&ly9+0b-DRwA2A?s2l4CYge 
zOVQ$9xb9FNX15Ea%+RH@?V%e|~R&jmXAt`jIy+yYG@}kL4`^$8eu@(c-N)nCx(!H2Q$LK>`c^8DU553T+(%f%@($I5DX_;ccTZ~ibQph<6 zqnFR#$K=-27Y-Q zx>inln4)iI8>xl0ZRaIics^LkP`VTTKz4SK2|HBF^YluD9qPGM`VdrzQkd1oxL+y9 zo=;1SHo|(#kN6_3+T{TA2H!h-f5D#d+p|x?fA!1hSoo9&6t5hmao7zciT{w82Os6l ztzf>4qMq{3z<)k*Ch!(F*Hft}6JP0NrK6O?&{Bu#X&4A5&WeoGEODOT@X0I$ci0Kf za$O1;vde>iAg9N2LmsSxu}@xxYFcFfnv;o5br~G^eHP>@KS4YD-cy|-I`<;3 zEYx+)jb;8iL*XE0nEl3CY-AvuhA3nd7`y2OE7v>Y59&*!vv7hdT{>UihSmo!^Us5m z`a0-i%WB7}PMW0#R54K23Yc)!YJA z*U-_l3=vN@{SZ4s_EyWPf6<6$#`r4yMe{+2WP~q`=0!cFazKt0m?)8B!z%dm+ZEHawIxC{NESiuj4T5%S} zW<$K)OfN$u;yV_LYYbTsK^;P|MjVUbu1+3H+{jifph zsUtA=Bn4bkjpFLDDra-GFgsun|Fdcx6p7DT!c5qYa)UcN;AK1!|DjE>cv3hsJC%Tc z(Rer0%A>eJKVR;mfrf{OM9*2A@O$W0>?m$Wx}e(1PA>`HP;o0p>=9owb+rlT?U}@l z_Awc0q40a>DhT$$SI$BeUbzSd?0kFpKn$Q6Q%JyyT*oEVDct zfIa=Td4pf@z2C9U18rYU{)DbY9cSxnT~-J+uZYegOyfhY?xFVSSS{Qhm=fTEu^iKq zc~>dipV)xt>rxNqZ!PbG{s?VL_R?}IY1pfIeF4Lym77QbtB_l@0eU+|ur**^yhPIQ z#9WwZwx;;j>n!MPR^6HSn>B0QHpJ-l!|Rngz7O=y@7j-P@z46{&SBy}Zjt{b-VVG;-J zZjtJu&rJ78`&5T_*NJTS6bN4@9?RP-0U4ijj)u~#ie33xYS%@3bR-= zdC@F?A^Nh_68R@OCZHOuIQH018m><+@1nDw-tHX!1}m_gmA zmmm)lveM!KZ}Se)#73<1tfKYB?u~J>iiBc65fYFKEki!*@B)dXWVnE7Et4I7R?770 ziX8x}!_S)bV=heAFbO{^@9>SH7m_m-$UL}kb!Hvkr+iOw8FWRioG0M4KkH{taJ%un zHk{M{GE6Qn5rSob-Wnzf1LacKjq{Flspb1nx_rDHzFpYd%@DtNGG>XLMmyYgN<`Ud z6qMK=;9g4t4=x`B653&b5ho&=A+#@|vlbaHDAs4bAtEOo%~kTFpps%X^nZr`k!Nx9 zsv;!vo8GPI-RPY|ZRC`0y!{!&uZkk&_q64I4st=wm_HGILqBx-7g;-|Pe7h7DR;Wl z-(Mor4FN;$AhL47;>ELh)(gL)O{LMwG~1f?4N!-+V&Ah)dX~5=jCtn8L0Q1ou0!M! zhF!>bo;~p-T#YK!S?ljBxK^^+zW88L*?*vYg$lV9ey_Mim>U*MuEa8iq7n9mixAjh z+PL?1?C)UXvj*g;kVCke7a2Af`M%j54jE1(Yp{-IZ{=a8-J#mj3c6g|IZ@w(jLRw% zSH_Pqv;Q}d?H~x}?K|Kz!3HY-6s)UZ}Xx#ZoMBqJf+Wn(xA@aa3*cY%grbHN^c7ZdXK&aF{iv2>& zFO0))Sx?gn0#Cx3P#eOgN{T1Hp0^x!!B4D4^K%5&0r`^fU6shIu zI90Olq_G@gh{uWZdHA|No={TD`^=VhC{s>S^fb8Cj0t}zY}I*TvF4(VRmtzROJojx zQ!2;12}}k~rO2B&=PpE!qie`28ZBIwt_Jw_y+7Mb>1HFxq}&Yvw}hO2`SMkc%?7T} z{RJZ>HV?2jPXbu9CwrK^b2qGrW0mq|be0AY9%9%UKRfbe7crC6LY1u_)><}2U{7FN zXRnHMVixnQL7X|!uv2pdanPl0cAmY-9%qU}(mJ)mb%_aU8ydE8fJPCh!n#<;c&>&VF=Y0zKo z%4at*Y7<_`EYmkYc?W>6^n;a7_!05fba|5E_5!>&W3UmcNLw z+_tZ(o6tLZ`vuVeJOLLn2EHZJj&}Gt=7{H*{V{9)6>`taM~;lRjvO>{`tmHyp-f(< zjE0S)F;6jmZ>sU6{_k=Y4r*fS3NuXzN)(`L?@DtMqaz&&>mW7|x7pjp{)z%pR(&FLm+jkl-*$*2>&3Zk%X^#@t9qni9UNg88)n1<`O zv>64vI^^LUCK2=_zZcT+b0kLoC8$H}I`!FN_}nl#{%J=`jT7N}`T`>ADX<35(tvo) zgDlNj|7*zWPDi#}1lAdea31@DL+Y81Yuc{FI>h&GMbeJUT)~*$J2fJ?7SY4)g!#Q) z6_4-8eCGOmjS;&Fham87#LOe)UN~b~j&>Iz4sUM_&)pN4fJ~GdXROmX#!G3qDqyQ> z&F4)(28zESnl8X9;3f-TB>fVruLytY&vvf^dNd02ft&2=jni0L0b-ovXg{WHuaU^S zN&oI9*WpAv82DQG1Sewm1OsD=!+Jc{mNEJcd~>-PZ!rZA)$Q8GGF3T@86SfY5JOVA zZ{-{9!^21QaT5jkSlJH0-Dgl@Y=lP`IL4%wLkPKMXWz%-)9H|?HDT3slVQ$=XShj|jool~C|`v){Y6-h zlOARXH#yYP0TENPu*1sGE>q0FPOJc}KW#DE9xZZ~i^TanJK=NaKPf^1R@*8#i#uyi zRzR3~Jpf4uBYnSl0_j`NbFY1?e(CwD9(D*1_QI*C_oLV;J-Q!Q3zwoD$bI=M@^J3X z?!u96I=mybTm7xNl2+)`C)8lx8;#n2!mlxC$N=qfwFglK&+WqMna@3kcefl}MCgoD zKo6XdoH^CaS<<&Yk>tl{f8Ec9GGyl_YzYi~e1oeNmJ|#eysgR$F27UpmykI==hwk< z8oAyQD+JxeTPuFT-5~*p7iQ6WIMq$gPKPpmV274FA2;LstE1apZm#6MrE3emiSqZq zB|H=NcN?%<7hma<6mIh8?sqlSbcR|(9vDT39Xgt$!aZn}GhrRETdmTu+R?h7-fs7z z(=*Cmw6M=V!`hL=Lks)J56#T)pd{bl)eoOi5k%Gk$RU+m7}twLXFa+6i=0WoKEZ@8mzErukWcwPnb-T@-#m!-qS?s-^C zPHXWF=6Kfm-a)27g#0pg^Rp07F%i~sZc>fx1;ASMJ0|9>Ei+=@c3kYq1Cll~?&jM- zL&*cYgXXY~Q7Xw){TQX0R@5z`V-)l7hC@BQZBF5j(6@Vu))JgrY+|!ldtfpRrJww!6JK-)s>m9p{7QRl!LD_*5lDF zKHvLT9bbm1quiVf9q7Zw4PZI&*dJSX#XQLFr6G}n7pvbA7m5ppH$Z>9tFT$S6sJ&% zg#gF3`S1nDX29R-0e(R?=tA5hPvyB{tgr08;ixQjQXnE#EUTB(i86zNjMd?TED^P zwj=K|oOm0e4gFvWUn_lFbg=p#kik;XL(WpbhHX#?JHi~oMtAh?Hn 
zRxV3(H$cCLO%P}^b!lCGB39CanXB9G?OWdC3E?m7c&-=zp8Plqz)IKtgPEuO-*euS z++^WKfE9j26c)KZ{Y#StNAR{nN!2m^D$EC6$fCFgIH;U~cDaQ5ezW_vepRrw_$K!0 zi!nFUFN~(wj4hR_b)62aeh{vJm!>ebF{5_sm4!Ek7vWvTB-f|wBIt2Mp=`FrZUAcA zRnxZXo$%FKnXo4wE_TA8EiQN#bo^$R#+i_f{L^?mb|T!=vELi;tuK|)&D_$U;XTSnO z78$Y3m^CJ>v%yI=ImH&I+2#y8oMo4D>~WrbE^xp_4u>(VMRsW@)5tN;fCYvuGGduA zYfM;YgH5*BW`|w&*yn(mRqo4xAtT02*kF?_w%K8q%cuPQ(Ed1nIFn0zR@=CSYnkOb zuIC1B1Uc$#N; zmgjh$AusSEFYz+3@G7sd$m_hpo4m!_yu&d@yvutWXNmXufDc(_g^&1{PgvzsK4Z-1 he8HD|#n*hpx2*9U-}3`M@)JMv3lo0jH-6^?e*$HqMCJeh literal 2042 zcmZ9Nxla{Q6vhvVh=?2RJK}~2xFMT}$i9mT3ix~?B9&1>j4>8ghQb&NV~mA`r7_0J zSXlT65GyJRVu%H?FoqZl3k%OT=l2rhNxt{H=giEVxpTicucxP@xubc|EXyKt#~56P zN1RQrQ?8S!%X!+>;=1WN=4x>Dxm0g;wY$nuqw}z<%vI_-=-TZ%kG$2wfMTy3tqu5QMf z&)<)wTpH^>L#|<$p5=r~??r3OtW$I7oXak~2VE!X9qQf(T%9hhwc4fgbzJi`ARX6O zYu5YKG0mBYQ3l`OcvcMN;P9fz7vSXL$p664*^$>T34Vm>oXF?kjip=%2XiBT2dkGw z{v7^&z`8Y&=M)5I zU}|mT({OBEd z!QXIdd*tOif?wb>ncSnkFoC^+>0OZ*6a}BcJel6(0nFVK`6Kup-rF1XQhhKU^YG68 z$afS6-@;Ov+v72ugSShf{t4Ep33yDyVSNl9voKXg4ky$gJQiTTK5&l@uuhHQpWrhv z>~GlKu)Se-!{&y)4O<&FHf(Fy)c&)lVL!umhTROi88$QQW!THGlVKmjE`~h}TNrjQ z>|ofyuz|7tNsrU=3wZ}@d(z`nVw;U^HMTYBgF4Q3COuL)+m`faZvy`(fN#9n^ zHY9ysx%!+!{Y#-9rl<(maaakfU^T3PwNQOlQ5Q&hu7>sG>aB`~ Date: Mon, 4 Jul 2022 11:53:56 +0200 Subject: [PATCH 296/297] Get pretrain on non causal mlm script --- finetune_t0_non_causal_decoder.py | 4 +- megatron/text_generation_utils.py | 4 +- megatron/utils.py | 77 ++++---- pretrain_gpt.py | 6 +- pretrain_non_causal_decoder_on_mlm.py | 185 ++++++++++++++++++ pretrain_prefix_lm.py | 6 +- .../test_sampling.py | 6 +- tasks/eval_harness/evaluate.py | 4 +- tasks/zeroshot_gpt/evaluate.py | 4 +- tests/test_tensor_parallel.py | 4 +- 10 files changed, 244 insertions(+), 56 deletions(-) create mode 100644 pretrain_non_causal_decoder_on_mlm.py diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 0b649970f..58b3b1f8b 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -8,7 +8,7 @@ from megatron.enums import PositionEmbeddingType, AttnMaskType from megatron.model import GPTModelPipe from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids, get_packed_attention_mask +from megatron.utils import get_attention_masks_and_position_ids, get_packed_attention_mask import deepspeed from deepspeed.runtime.utils import see_memory_usage @@ -73,7 +73,7 @@ def get_batch_pipe(data): decoder_is_inputs = data_c["decoder_is_inputs"][:, :-1] # Get the masks and position ids. - causal_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + causal_mask, loss_mask, position_ids = get_attention_masks_and_position_ids( tokens, tokenizer.eod, args.reset_position_ids, diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 7a98b5d35..bd0ec59d8 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -26,7 +26,7 @@ from megatron import get_args from megatron import get_tokenizer from megatron import mpu -from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model +from megatron.utils import get_attention_masks_and_position_ids, unwrap_model from megatron.p2p_communication import recv_forward, send_forward # These are needed to unwrap the model, would be nice to put these in megatron.utils if possible? @@ -42,7 +42,7 @@ def get_batch(context_tokens): # Move to GPU. tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda() # Get the attention mask and position ids. 
-    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, _, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
diff --git a/megatron/utils.py b/megatron/utils.py
index 6f3a0fa41..4f67c75c8 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -151,7 +151,7 @@ def check_adlr_autoresume_termination(iteration, model,
     sys.exit(0)
 
 
-def get_ltor_masks_and_position_ids(
+def get_attention_masks_and_position_ids(
     data,
     eod_token,
     reset_position_ids,
@@ -159,6 +159,7 @@
     eod_mask_loss,
     prefix_indices,
     loss_on_targets_only,
+    ltor: bool = True,
 ):
     """
     Build masks and position id for left to right model.
@@ -177,9 +178,10 @@
         att_mask_batch = micro_batch_size
     else:
         att_mask_batch = 1
-    attention_mask = torch.tril(torch.ones(
-        (att_mask_batch, seq_length, seq_length), device=data.device)).view(
-        att_mask_batch, 1, seq_length, seq_length)
+    attention_mask = torch.ones((att_mask_batch, seq_length, seq_length), device=data.device)
+    if ltor:
+        attention_mask = torch.tril(attention_mask)
+    attention_mask = attention_mask.view(att_mask_batch, 1, seq_length, seq_length)
 
     # Loss mask.
     loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
@@ -198,42 +200,43 @@
     # Loop through the batches:
     for b in range(micro_batch_size):
 
-        # Find indices where EOD token is.
-        eod_index = position_ids[b, data[b] == eod_token]
+        if reset_position_ids or reset_attention_mask:
+            # Find indices where EOD token is.
+            eod_index = position_ids[b, data[b] == eod_token]
 
-        # If the last eod token is not the last token of the sequence, we suppose that there is a partial document
-        # We treat this case as if we add an eod token at the end of the sequence.
-        if data[b][-1] != eod_token:
-            eod_index = torch.cat(
-                (eod_index, torch.tensor([len(data[b])], dtype=eod_index.dtype, device=eod_index.device))
-            )
+            # If the last eod token is not the last token of the sequence, we suppose that there is a partial document
+            # We treat this case as if we add an eod token at the end of the sequence.
+            if data[b][-1] != eod_token:
+                eod_index = torch.cat(
+                    (eod_index, torch.tensor([len(data[b])], dtype=eod_index.dtype, device=eod_index.device))
+                )
 
-        # Detach indices from positions if going to modify positions.
-        if reset_position_ids:
-            eod_index = eod_index.clone()
-
-        # Loop through EOD indices:
-        prev_index = 0
-        for j in range(eod_index.size()[0]):
-            i = eod_index[j]
-
-            if reset_attention_mask:
-                # Prevent cross document interactions.
-                attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
-
-                # Prefix lm per document.
-                if prefix_indices:
-                    assert isinstance(prefix_indices[b], list), f"prefix for a row has to be document specific, and consequently return a list, got {prefix_indices[b]}"
-                    attention_mask[b, 0, prev_index: prefix_indices[b][j], prev_index: prefix_indices[b][j]] = 1
-                    if loss_on_targets_only:
-                        # Last token of the prefix should predict the prefix_index id
-                        loss_mask[b, prev_index: prefix_indices[b][j] - 1] = 0.0
-
-            # Reset positions.
-            if reset_position_ids:
-                position_ids[b, (i + 1):] -= (i + 1 - prev_index)
-
-            prev_index = i + 1
+            # Detach indices from positions if going to modify positions.
+            if reset_position_ids:
+                eod_index = eod_index.clone()
+
+            # Loop through EOD indices:
+            prev_index = 0
+            for j in range(eod_index.size()[0]):
+                i = eod_index[j]
+
+                if reset_attention_mask:
+                    # Prevent cross document interactions.
+                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
+
+                    # Prefix lm per document.
+                    if prefix_indices:
+                        assert isinstance(prefix_indices[b], list), f"prefix for a row has to be document specific, and consequently return a list, got {prefix_indices[b]}"
+                        attention_mask[b, 0, prev_index: prefix_indices[b][j], prev_index: prefix_indices[b][j]] = 1
+                        if loss_on_targets_only:
+                            # Last token of the prefix should predict the prefix_index id
+                            loss_mask[b, prev_index: prefix_indices[b][j] - 1] = 0.0
+
+                # Reset positions.
+                if reset_position_ids:
+                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
+
+                prev_index = i + 1
 
     # Prefix lm per row.
     if prefix_indices is not None and (reset_attention_mask is False):
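
The core of the megatron/utils.py hunk is the new `ltor` switch: the helper now starts from an all-ones (fully visible) mask and only applies `torch.tril` when `ltor=True`, so the same function can serve causal, prefix, and fully bidirectional decoders. A minimal self-contained sketch of that logic (the helper name, defaults, and sizes below are illustrative, not part of the patch):

    import torch

    def build_attention_mask(seq_length: int, ltor: bool = True, prefix_length: int = 0):
        # Start from an all-ones mask (1 = position is visible), as in the refactor.
        mask = torch.ones((1, seq_length, seq_length))
        if ltor:
            # Causal: position i may only attend to positions <= i.
            mask = torch.tril(mask)
        if prefix_length:
            # Prefix LM: the first `prefix_length` tokens attend bidirectionally.
            mask[:, :prefix_length, :prefix_length] = 1
        return mask.view(1, 1, seq_length, seq_length)

    causal = build_attention_mask(5)                   # strictly lower-triangular visibility
    prefix = build_attention_mask(5, prefix_length=3)  # bidirectional over the 3-token prefix

Existing call sites keep the default `ltor=True`, so their behavior is unchanged by the rename.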
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index fdd4d28be..f308956a6 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -26,7 +26,7 @@ from megatron.enums import AttnMaskType
 from megatron.model import GPTModel, GPTModelPipe
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices
+from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices
 from megatron.utils import average_losses_across_data_parallel_group
 
 import deepspeed
@@ -98,7 +98,7 @@ def get_batch(data_iterator):
     tokens = tokens_[:, :-1].contiguous()
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
@@ -129,7 +129,7 @@ def get_batch_pipe(data):
     tokens = tokens_[:, :-1].contiguous()
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
diff --git a/pretrain_non_causal_decoder_on_mlm.py b/pretrain_non_causal_decoder_on_mlm.py
new file mode 100644
index 000000000..243d8a471
--- /dev/null
+++ b/pretrain_non_causal_decoder_on_mlm.py
@@ -0,0 +1,185 @@
+# coding=utf-8
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Pretrain GPT"""
+
+import torch
+from functools import partial
+from megatron import get_args
+from megatron import print_rank_0
+from megatron import get_timers
+from megatron import get_tokenizer
+from megatron import mpu
+from megatron.data.mlm_dataset import build_train_valid_test_datasets, build_dataset_group
+from megatron.enums import AttnMaskType
+from megatron.model import GPTModel, GPTModelPipe
+from megatron.training import pretrain
+from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_
+from megatron.utils import average_losses_across_data_parallel_group
+
+import deepspeed
+from deepspeed.runtime.utils import see_memory_usage
+import subprocess
+
+def model_provider(pre_process=True, post_process=True):
+    """Build the model."""
+
+    print_rank_0('building GPT model ...')
+    see_memory_usage(f"Before Building Model", force=True)
+
+    args = get_args()
+
+    with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
+                             remote_device=None if args.remote_device == 'none' else args.remote_device,
+                             config_dict_or_path=args.deepspeed_config,
+                             enabled=args.zero_stage == 3,
+                             mpu=mpu):
+        if args.deepspeed:
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                attn_mask_type=AttnMaskType.prefix
+            )
+            # This is a hack to give us a reference to get_batch_pipe from within training.py
+            # We need to call model.set_batch_fn after deepspeed.initialize
+            model._megatron_batch_fn = get_batch_pipe
+
+        else:
+            raise NotImplementedError("DeepSpeed has to be activated.")
+    see_memory_usage(f"After Building Model", force=True)
+    return model
+
+def get_batch_pipe(data):
+    """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
+    args = get_args()
+    tokenizer = get_tokenizer()
+
+    # Items and their type.
+    keys = ["input_tokens", "target_tokens"]
+    datatype = torch.int64
+
+    # Broadcast data.
+    data_b = mpu.broadcast_data(keys, data, datatype)
+
+    # Unpack.
+    input_tokens = data_b["input_tokens"].long()
+    target_tokens = data_b["target_tokens"].long()
+    tokens_ = torch.concat([input_tokens, target_tokens], dim=-1)
+    labels = tokens_[:, 1:].contiguous()
+    tokens = tokens_[:, :-1].contiguous()
+
+    # Prefix
+    batch_size, input_size = input_tokens.shape
+    prefix_indices = torch.full((batch_size,), input_size, dtype=torch.long, device=input_tokens.device)
+
+    # Get the masks and position ids.
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
+        tokens,
+        tokenizer.eod,
+        # TODO @thomasw21 not supported
+        reset_position_ids=None,
+        # TODO @thomasw21 not supported
+        reset_attention_mask=None,
+        # TODO @thomasw21 not supported
+        eod_mask_loss=False,
+        prefix_indices=prefix_indices,
+        loss_on_targets_only=args.loss_on_targets_only
+    )
+
+    return (tokens, position_ids, attention_mask), (labels, loss_mask)
+
+
+def train_valid_test_datasets_provider(train_val_test_num_samples):
+    """Build train, valid, and test datasets."""
+    args = get_args()
+    train_ds, valid_ds, test_ds = None, None, None
+
+    print_rank_0('> building train, validation, and test datasets for GPT ...')
+    # Option 1 of data loading using --data-path
+
+    if args.data_path:
+        train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
+            data_prefix=args.data_path,
+            data_impl=args.data_impl,
+            splits_string=args.split,
+            train_valid_test_num_samples=train_val_test_num_samples,
+            sequence_length=args.seq_length,
+            noise_density=args.noise_density,
+            mean_noise_span_length=args.mean_noise_span_length,
+            seed=args.seed,
+            skip_warmup=(not args.mmap_warmup))
+    # Option 2 of data loading using --(train|valid|test)-weighted-split-paths
+    elif args.train_weighted_split_paths:
+        assigned_train_valid_test = []
+        if args.train_weighted_split_paths is not None:
+            train_ds = []
+            assigned_train_valid_test.append("train")
+        if args.valid_weighted_split_paths is not None:
+            valid_ds = []
+            assigned_train_valid_test.append("valid")
+        if args.test_weighted_split_paths is not None:
+            test_ds = []
+            assigned_train_valid_test.append("test")
+
+        for s in assigned_train_valid_test:
+            data_groups = zip(eval(f"args.{s}_weighted_split_paths"),
+                              eval(f"args.{s}_weighted_split_weights"),
+                              eval(f"args.{s}_weighted_split_splits"),
+                              eval(f"args.{s}_weighted_split_names"))
+            for paths, weights, splits, name in data_groups:
+                d = build_dataset_group(name, paths, weights, splits,
+                                        args.data_impl,
+                                        train_val_test_num_samples,
+                                        args.seq_length, args.seed,
+                                        (not args.mmap_warmup),
+                                        train_valid_test=s)
+                eval(f"{s}_ds").append(d)
+    else:
+        raise NotImplementedError("No dataloading argument passed")
+
+    print_rank_0("> finished creating GPT datasets ...")
+    return train_ds, valid_ds, test_ds
+
+def command_exists(cmd):
+    result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
+    return result.wait() == 0
+
+def git_ds_info():
+    from deepspeed.env_report import main as ds_report
+    ds_report()
+
+    # Write out version/git info
+    git_hash_cmd = "git rev-parse --short HEAD"
+    git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
+    if command_exists('git'):
+        try:
+            result = subprocess.check_output(git_hash_cmd, shell=True)
+            git_hash = result.decode('utf-8').strip()
+            result = subprocess.check_output(git_branch_cmd, shell=True)
+            git_branch = result.decode('utf-8').strip()
+        except subprocess.CalledProcessError:
+            git_hash = "unknown"
+            git_branch = "unknown"
+    else:
+        git_hash = "unknown"
+        git_branch = "unknown"
+    print(f'**** Git info for Megatron: git_hash={git_hash} git_branch={git_branch} ****')
+
+
+if __name__ == "__main__":
+    git_ds_info()
+    pretrain(train_valid_test_datasets_provider, model_provider, forward_step_func=None,
+             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
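
The batch construction in `get_batch_pipe` above is the heart of this script: MLM inputs and targets are concatenated into a single decoder sequence, and the prefix boundary is simply the input length, so the whole input segment gets bidirectional attention while the targets stay causal. A toy sketch with made-up token ids (shapes only; in the real script the batch comes from `mpu.broadcast_data`):

    import torch

    input_tokens = torch.tensor([[10, 11, 12, 13]])   # MLM inputs (with sentinels), length 4
    target_tokens = torch.tensor([[20, 21, 22]])      # MLM targets, length 3

    tokens_ = torch.cat([input_tokens, target_tokens], dim=-1)  # [10 11 12 13 20 21 22]
    labels = tokens_[:, 1:].contiguous()    # shifted left: each position predicts the next token
    tokens = tokens_[:, :-1].contiguous()   # model input: everything but the last token

    # Every row's prefix ends where the MLM input ends.
    prefix_indices = torch.full((tokens_.shape[0],), input_tokens.shape[1])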
diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py
index c531db863..b2930b531 100644
--- a/pretrain_prefix_lm.py
+++ b/pretrain_prefix_lm.py
@@ -26,7 +26,7 @@ from megatron.enums import AttnMaskType
 from megatron.model import GPTModel, GPTModelPipe
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_
+from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_
 from megatron.utils import average_losses_across_data_parallel_group
 
 import deepspeed
@@ -98,7 +98,7 @@ def get_batch(data_iterator):
     )
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
@@ -140,7 +140,7 @@ def get_batch_pipe(data):
     )
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
diff --git a/scripts/test_multiple_dataset_sampling/test_sampling.py b/scripts/test_multiple_dataset_sampling/test_sampling.py
index 2d5326c8c..8bed75c2a 100644
--- a/scripts/test_multiple_dataset_sampling/test_sampling.py
+++ b/scripts/test_multiple_dataset_sampling/test_sampling.py
@@ -25,7 +25,7 @@ from megatron.data.gpt_dataset import build_train_valid_test_datasets
 from megatron.model import GPTModel, GPTModelPipe
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids
+from megatron.utils import get_attention_masks_and_position_ids
 from megatron.utils import average_losses_across_data_parallel_group
 
 import deepspeed
@@ -117,7 +117,7 @@ def get_batch(data_iterator):
     tokens = tokens_[:, :-1].contiguous()
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
@@ -144,7 +144,7 @@ def get_batch_pipe(data):
     tokens = tokens_[:, :-1].contiguous()
 
     # Get the masks and position ids.
-    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py
index 68dd649fd..cfa29a205 100644
--- a/tasks/eval_harness/evaluate.py
+++ b/tasks/eval_harness/evaluate.py
@@ -24,7 +24,7 @@
 from megatron.training import setup_model_and_optimizer, get_model
 from megatron.mpu.mappings import gather_from_tensor_model_parallel_region
 
-from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
+from megatron.utils import get_attention_masks_and_position_ids, unwrap_model
 from megatron.p2p_communication import recv_forward, send_forward
 import pickle
 import json
@@ -185,7 +185,7 @@ def _collate(x):
 
     def create_model_inputs(self, tokens):
         args = get_args()
 
-        attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+        attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
             tokens,
             self.EOT_TOKEN_ID,
             args.reset_position_ids,
diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py
index 090533c24..b17c76848 100644
--- a/tasks/zeroshot_gpt/evaluate.py
+++ b/tasks/zeroshot_gpt/evaluate.py
@@ -26,7 +26,7 @@ from megatron.checkpointing import load_checkpoint
 from megatron.model.gpt_model import GPTModel
 from megatron.training import get_model
-from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
+from megatron.utils import get_attention_masks_and_position_ids, unwrap_model
 from megatron.p2p_communication import recv_forward, send_forward
 from tasks.finetune_utils import build_data_loader
@@ -72,7 +72,7 @@ def process_batch(batch):
     tokens = tokens_[:, :-1].contiguous()
 
     # Get the masks and position ids.
-    attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
+    attention_mask, _, position_ids = get_attention_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py
index 25921c12a..a0d257404 100644
--- a/tests/test_tensor_parallel.py
+++ b/tests/test_tensor_parallel.py
@@ -18,7 +18,7 @@ from multiprocessing import Pool
 from megatron.checkpointing import save_checkpoint
-from megatron.utils import get_ltor_masks_and_position_ids
+from megatron.utils import get_attention_masks_and_position_ids
 
 @require_deepspeed
 @require_torch_multi_gpu
@@ -98,7 +98,7 @@ def infer_model(args):
 
     def create_model_inputs(tokens):
         args = get_args()
 
-        attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
+        attention_mask, loss_mask, position_ids = get_attention_masks_and_position_ids(
             tokens,
             tokenizer.eod,
             args.reset_position_ids,
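
Before the follow-up test commit, it helps to spell out the two `prefix_indices` layouts that the renamed helper accepts, since both appear across this patch. A sketch with illustrative values only (not taken from any real batch):

    # 1) One prefix per row (used when reset_attention_mask is False): an int-like
    #    value per sample, as the MLM script's per-row tensor above.
    prefix_indices_per_row = [4, 7]          # row 0 prefix ends at 4, row 1 at 7

    # 2) One prefix per packed document (used when reset_attention_mask is True):
    #    a list per row, one entry for each document delimited by EOD tokens,
    #    which is what the `isinstance(prefix_indices[b], list)` assert enforces.
    prefix_indices_per_doc = [[4, 9], [3]]   # row 0 holds two documents, row 1 holds one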
-"""Pretrain GPT""" +"""Pretrain GPT on MLM dataset""" import torch -from functools import partial from megatron import get_args from megatron import print_rank_0 -from megatron import get_timers from megatron import get_tokenizer from megatron import mpu from megatron.data.mlm_dataset import build_train_valid_test_datasets, build_dataset_group from megatron.enums import AttnMaskType -from megatron.model import GPTModel, GPTModelPipe +from megatron.model import GPTModelPipe from megatron.training import pretrain -from megatron.utils import get_attention_masks_and_position_ids, get_prefix_indices, reweight_loss_mask_ -from megatron.utils import average_losses_across_data_parallel_group +from megatron.utils import get_attention_masks_and_position_ids import deepspeed from deepspeed.runtime.utils import see_memory_usage diff --git a/tests/test_training.py b/tests/test_training.py index 4ceb2aca3..4aad5a234 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -725,3 +725,116 @@ def test_skip_train_iteration(self): train_iterations = range(1,10) for i in train_iterations: self.assertTrue(f"iteration {i:8d}/" in cs.out) + + def test_training_non_causal_decoder_on_mlm(self): + # all in one test + src_dir = self.src_dir + data_dir = self.copy_data_to_temp(self.data_dir,"gpt2") + + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) + logs_dir = f"{output_dir}/logs" + Path(logs_dir).mkdir(parents=True, exist_ok=True) + + pp_size, tp_size, dp_size = get_3d_dimensions() + num_gpus = pp_size * tp_size * dp_size + + n_samples = 200 # about 37 iterations + exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume + args = f""" + --tensor-model-parallel-size {tp_size} + --pipeline-model-parallel-size {pp_size} + --distributed-backend nccl + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length 128 + --max-position-embeddings 1024 + --micro-batch-size 1 + --rampup-batch-size 2 2 {n_samples} + --global-batch-size 16 + --train-samples {n_samples} + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --clip-grad 1.0 + --weight-decay 1e-1 + --fp16 + + --log-interval 5 + --save-interval 10 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --exit-interval {exit_interval} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --log-path {logs_dir} + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --log-level debug + """.split() + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config.json + --zero-stage 1 + --deepspeed-activation-checkpointing + """.split() + + script = [f"{src_dir}/pretrain_non_causal_decoder_on_mlm.py"] + launcher = get_launcher(num_gpus) + + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + # 1. 
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test deepspeed is running
+        self.assertIn("DeepSpeed info", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)
+
+        # test there should be no checkpoint this round
+        self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out)
+
+        # test checkpoint saving
+        self.assertIn("successfully saved checkpoint at iteration", cs.out)
+
+        # test tensorboard
+        tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
+        self.assertEqual(len(tensorboard_files), 1, "tensorboard files")
+
+        # 2. test training from checkpoint: resume
+        # now do it again, this time resuming from the checkpoint
+        with CaptureStdout() as cs:
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        # test checkpoint loading
+        self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out)
+
+        # test reports
+        self.assertIn("consumed samples", cs.out)
+
+        # test checkpoint saving
+        self.assertIn("successfully saved checkpoint at iteration", cs.out)
+
+        # test tensorboard (1 file from the first run, plus 1 now)
+        tensorboard_files = glob.glob(f"{output_dir}/tensorboard/events*")
+        self.assertEqual(len(tensorboard_files), 2, "tensorboard files")
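
For readers who want to try the new script outside the test harness, a hypothetical standalone launch is sketched below. The flag values mirror the test above, but the `deepspeed` launcher invocation, the data and vocab paths, and the GPU count are assumptions, not part of the patch:

    import subprocess

    # Minimal sketch: launch pretrain_non_causal_decoder_on_mlm.py on 2 GPUs.
    # All paths are placeholders; point them at real vocab/merge/data files.
    cmd = [
        "deepspeed", "--num_gpus", "2", "pretrain_non_causal_decoder_on_mlm.py",
        "--tensor-model-parallel-size", "1",
        "--pipeline-model-parallel-size", "1",
        "--distributed-backend", "nccl",
        "--num-layers", "2", "--hidden-size", "64", "--num-attention-heads", "2",
        "--seq-length", "128", "--max-position-embeddings", "1024",
        "--micro-batch-size", "1", "--global-batch-size", "16",
        "--train-samples", "200",
        "--optimizer", "adam", "--lr", "1e-4", "--fp16",
        "--vocab-file", "gpt2-tiny-vocab.json",
        "--merge-file", "gpt2-tiny-merges.txt",
        "--data-path", "meg-gpt2-openwebtext_text_document",
        "--deepspeed", "--deepspeed_config", "ds_config.json", "--zero-stage", "1",
    ]
    subprocess.run(cmd, check=True)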