From 67c08f09b6d9099082a302343633f6b1839aeb0b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 24 Jan 2022 22:39:53 +0000 Subject: [PATCH 01/33] Reshape deepspeed checkpoint --- megatron/checkpoint/__init__.py | 0 megatron/checkpoint/conversion_utils.py | 26 ++ megatron/checkpoint/deepspeed_checkpoint.py | 278 ++++++++++++++++++ megatron/checkpoint/reshape_meg_2d.py | 104 +++++++ megatron/checkpointing.py | 14 +- .../deepspeed_checkpoint.py | 189 ------------ .../deepspeed_to_deepspeed.py | 135 +++++++++ .../deepspeed_to_megatron.py | 92 ++++-- .../deepspeed_to_transformers.py | 53 ++-- .../convert_checkpoint/inspect_checkpoint.py | 13 +- .../inspect_deepspeed_checkpoint.py | 60 +++- 11 files changed, 705 insertions(+), 259 deletions(-) create mode 100644 megatron/checkpoint/__init__.py create mode 100644 megatron/checkpoint/conversion_utils.py create mode 100644 megatron/checkpoint/deepspeed_checkpoint.py create mode 100644 megatron/checkpoint/reshape_meg_2d.py delete mode 100644 tools/convert_checkpoint/deepspeed_checkpoint.py create mode 100644 tools/convert_checkpoint/deepspeed_to_deepspeed.py diff --git a/megatron/checkpoint/__init__.py b/megatron/checkpoint/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/megatron/checkpoint/conversion_utils.py b/megatron/checkpoint/conversion_utils.py new file mode 100644 index 000000000..6bf2f2f96 --- /dev/null +++ b/megatron/checkpoint/conversion_utils.py @@ -0,0 +1,26 @@ +import os + + +def validate_files(file_list): + for file in file_list: + if not os.path.isfile(file): + print(f'Error: {file} is not existent') + + +def get_files(dir): + file_list = [] + for root, dirs, files in os.walk(dir): + for file in files: + file_list.append(os.path.join(root, file)) + return file_list + + +def partition_data(data_list, num_partitions): + num_elems = len(data_list) + assert num_elems % num_partitions == 0 + partition_size = num_elems // num_partitions + partitions_list = [ + data_list[i:i + partition_size] + for i in range(0, num_elems, partition_size) + ] + return partitions_list diff --git a/megatron/checkpoint/deepspeed_checkpoint.py b/megatron/checkpoint/deepspeed_checkpoint.py new file mode 100644 index 000000000..d2a38aae0 --- /dev/null +++ b/megatron/checkpoint/deepspeed_checkpoint.py @@ -0,0 +1,278 @@ +import os +from typing import Dict +import torch +from .conversion_utils import partition_data, get_files +from .reshape_meg_2d import reshape_meg_2d_parallel, meg_2d_parallel_map +from megatron.tokenizer.tokenizer import _vocab_size_with_padding + +ZERO_FILE_PREFIX = 'zero_pp_rank_' +LAYER_FILE_PREFIX = 'layer_' +MP_RANK_FILE_PREFIX = 'mp_rank_' +EMBEDDING_LAYER_INDEX = 0 +FINAL_LAYER_NORM_INDEX = -1 +ARGS_KEY = 'args' +CHECKPOINT_INFO_KEY = 'checkpoint_info' +ITERATION_KEY = 'iteration' +SEQUENTIAL_LAYERS = [ + 'input_layernorm.weight', 'input_layernorm.bias', + 'self_attention.dense.bias', 'post_attention_layernorm.weight', + 'post_attention_layernorm.bias', 'mlp.dense_4h_to_h.bias', + 'position_embeddings.weight' +] + +LAYER_CONCAT_DIM = { + 'self_attention.dense.weight': 1, + 'mlp.dense_4h_to_h.weight': 1 +} + +WORD_EMBEDDINGS_KEY = 'word_embeddings.weight' +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +PADDED_VOCAB_SIZE = 'padded_vocab_size' + +class DeepSpeedCheckpoint(object): + def __init__(self, dir, tp_degree=None, pp_degree=None): + self.dir = dir + self.file_list = get_files(dir) + self.zero_files = self._get_files_with_prefix(self.file_list, + ZERO_FILE_PREFIX) + self.layer_files = 
self._get_files_with_prefix(self.file_list, + LAYER_FILE_PREFIX) + self.mp_rank_files = self._get_files_with_prefix( + self.file_list, MP_RANK_FILE_PREFIX) + self.layer_keys = self._get_layer_keys() + self.layer_count = len(self.layer_keys) + self.original_tp_degree = len( + self._get_files_with_prefix(self.layer_files, + f'{LAYER_FILE_PREFIX}01')) + self.original_pp_degree = len( + self.mp_rank_files) // self.original_tp_degree + self.dp_degree = max( + 1, + len(self.zero_files) // + (self.original_pp_degree * self.original_tp_degree)) + self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree + self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree + self.old_2d_map = meg_2d_parallel_map(self.original_pp_degree, + self.original_tp_degree) + self.old_2d_map.simple_init() + self.new_2d_map = reshape_meg_2d_parallel( + old_pp_degree=self.original_pp_degree, + old_tp_degree=self.original_tp_degree, + new_pp_degree=self.pp_degree, + new_tp_degree=self.tp_degree) + self.global_state = {} + + self._sanity_check() + self.pp_to_transformer_map = self._build_pp_transformer_map() + self.transformer_file_map = self._build_transformer_file_map() + self.tp_to_embedding_map = self._build_tp_other_layer_map( + EMBEDDING_LAYER_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map( + FINAL_LAYER_NORM_INDEX) + self._build_global_state() + + def show_2d_mapping(self): + print(f'reshaped 2d map ---- begin') + + for i in range(self.pp_degree): + for j in range(self.tp_degree): + file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j) + print(f'[{i}, {j}] = {file_list}') + + print(f'reshaped 2d map ---- end') + + def show_tp_embedding_map(self): + self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') + + def show_tp_final_norm_map(self): + self._dump_mapping(self.tp_to_final_norm_map, + 'tp_to_final_norm_layers') + + def show_pp_tranformer_map(self): + self._dump_mapping(self.pp_to_transformer_map, + 'pp_to_tranformer_layers') + + def show_transformer_file_map(self): + self._dump_mapping(self.transformer_file_map, + 'rank_to_tranformer_files') + + def _build_global_state(self): + sd = torch.load(self.mp_rank_files[0], + map_location=torch.device('cpu')) + self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) + self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) + + def get_embedding_layer_id(self): + return self.layer_keys[EMBEDDING_LAYER_INDEX] + + def get_final_norm_layer_id(self): + return self.layer_keys[FINAL_LAYER_NORM_INDEX] + + def get_iteration(self): + if not ITERATION_KEY in self.global_state: + sd = torch.load(self.mp_rank_files[0], + map_location=torch.device('cpu')) + self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) + + return self.global_state[ITERATION_KEY] + + def get_embedding_state(self, tp_index: int) -> Dict: + assert tp_index in self.tp_to_embedding_map.keys() + sd_list = [ + torch.load(fname, map_location=torch.device('cpu')) + for fname in self.tp_to_embedding_map[tp_index] + ] + sd = self._merge_state_dicts(sd_list) + sd[WORD_EMBEDDINGS_KEY] = self._strip_vocab_padding(sd[WORD_EMBEDDINGS_KEY]) + return sd + + def _get_checkpoint_value(self, key): + if not key in self.global_state: + sd = torch.load(self.mp_rank_files[0], + map_location=torch.device('cpu')) + self.global_state[key] = sd.get(key, None) + + return self.global_state[key] + + def get_args(self): + return self._get_checkpoint_value(ARGS_KEY) + + + def get_checkpoint_info(self): + return 
self._get_checkpoint_value(CHECKPOINT_INFO_KEY) + + def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + fname_list = self.get_2d_parallel_files(tp_index=tp_index, + pp_index=pp_index) + sd_list = [ + torch.load(fname, map_location=torch.device('cpu')) + for fname in fname_list + ] + # HACK HACK HACK, should be merging i.e., sd = self._merge_state_dicts(sd_list) + sd = sd_list[0] + return sd + + def get_transformer_state(self, tp_index: int, pp_index: int) -> list: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + t_list = [] + for fname_list in self.transformer_file_map[(tp_index, pp_index)]: + sd_list = [ + torch.load(fname, map_location=torch.device('cpu')) + for fname in fname_list + ] + sd = self._merge_state_dicts(sd_list) + t_list.append(sd) + return t_list + + def get_pp_transformer_map(self, pp_index: int) -> list: + assert pp_index < self.pp_degree + return self.pp_to_transformer_map[pp_index] + + def get_final_norm_state(self, tp_index: int) -> Dict: + assert tp_index in self.tp_to_final_norm_map.keys() + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], + map_location=torch.device('cpu')) + return sd + + def _build_tp_other_layer_map(self, layer_index: int): + assert layer_index < len(self.layer_files) + layer_files = self._get_files_with_prefix(self.layer_files, + self.layer_keys[layer_index]) + layer_file_partitions = partition_data(layer_files, self.tp_degree) + data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} + return data_map + + def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list: + assert tp_index < self.tp_degree + assert pp_index < self.pp_degree + file_indices = self.new_2d_map.get_data(pp_index=pp_index, + tp_index=tp_index) + return [self.mp_rank_files[i] for i in file_indices] + + def _build_pp_transformer_map(self): + data_map = {} + transformer_layers = self.layer_keys[1:-1] + layers_per_pp = len(transformer_layers) // self.pp_degree + data_map = { + i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] + for i in range(0, self.pp_degree) + } + return data_map + + def _dump_mapping(self, data_map, map_tag=None): + if map_tag is not None: + print(f'Dump mapping: {map_tag}') + for k, v in data_map.items(): + print(f'{k} = {v}') + + def _build_transformer_file_map(self): + transformer_layer_keys = self.layer_keys[1:-1] + file_map = {} + layers_per_pp = len(transformer_layer_keys) // self.pp_degree + for key_index, layer_key in enumerate(transformer_layer_keys): + pp_index = key_index // layers_per_pp + layer_files = self._get_files_with_prefix(self.layer_files, + layer_key) + layer_file_partitions = partition_data(layer_files, self.tp_degree) + for tp_index in range(self.tp_degree): + map_key = (tp_index, pp_index) + if not map_key in file_map.keys(): + file_map[map_key] = [] + file_map[map_key].append(layer_file_partitions[tp_index]) + + return file_map + + def _sanity_check(self): + assert len(self.mp_rank_files) % self.tp_degree == 0 + assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0 + assert len(self.layer_keys) > 2 + assert (len(self.layer_keys) - 2) % self.pp_degree == 0 + + def _get_files_with_prefix(self, all_files, prefix): + file_list = [] + for file_path in all_files: + _, fname = os.path.split(file_path) + if fname.startswith(prefix): + file_list.append(file_path) + + return sorted(file_list) + + def validate_files(self): + for file in self.file_list: + if not 
os.path.isfile(file): + print(f'Error: {file} is not existent') + + def _get_layer_keys(self): + key_set = set() + key_len = len(LAYER_FILE_PREFIX) + 2 + for file_path in self.layer_files: + _, fname = os.path.split(file_path) + key_set.add(fname[:key_len]) + return sorted(list(key_set)) + + def _merge_state_dicts(self, sd_list): + merged_sd = {} + for key in sd_list[0].keys(): + if not key in SEQUENTIAL_LAYERS: + cat_dim = LAYER_CONCAT_DIM.get(key, 0) + merged_sd[key] = torch.cat([sd[key] for sd in sd_list], + dim=cat_dim) + else: + merged_sd[key] = sd_list[0][key] + + return merged_sd + + + def _strip_vocab_padding(self, padded_vocab_tensor): + target_args = self.get_args() + checkpoint_info = self.get_checkpoint_info() + target_args.tensor_model_parallel_size = self.tp_degree + target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args) + assert target_args.padded_vocab_size <= padded_vocab_tensor.numel() + checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size + unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size) + return unpadded_vocab_tensor.clone() + diff --git a/megatron/checkpoint/reshape_meg_2d.py b/megatron/checkpoint/reshape_meg_2d.py new file mode 100644 index 000000000..46bc34c27 --- /dev/null +++ b/megatron/checkpoint/reshape_meg_2d.py @@ -0,0 +1,104 @@ +from .conversion_utils import partition_data + + +class meg_2d_parallel_map(object): + def __init__(self, pp_degree, tp_degree): + self.pp_degree = pp_degree + self.tp_degree = tp_degree + self.map = {} + + def simple_init(self): + self.map = { + self._make_key(i // self.tp_degree, i % self.tp_degree): [i] + for i in range(self.pp_degree * self.tp_degree) + } + + def add_data(self, pp_index, tp_index, data): + self._validate_indices(pp_index, tp_index) + assert type(data) is list + + key = self._make_key(pp_index, tp_index) + if not key in self.map.keys(): + self.map[key] = [] + self.map[key] += data + + def get_data(self, pp_index=None, tp_index=None): + self._validate_indices(pp_index, tp_index) + pp_indices = list(range( + self.pp_degree)) if pp_index is None else [pp_index] + tp_indices = list(range( + self.tp_degree)) if tp_index is None else [tp_index] + + result = [] + for i in pp_indices: + for j in tp_indices: + result += self.map[self._make_key(i, j)] + + return result + + def print_data(self, tag): + print(f'{tag}') + for key, value in self.map.items(): + print(f'{key} = {value}') + + def _validate_indices(self, pp_index, tp_index): + assert pp_index is None or pp_index < self.pp_degree + assert tp_index is None or tp_index < self.tp_degree + + def _make_key(self, i, j): + return f'{i},{j}' + + +def _reshape_tp_dimension(old_2d_map, new_tp_degree): + old_pp_degree = old_2d_map.pp_degree + new_2d_map = meg_2d_parallel_map(old_pp_degree, new_tp_degree) + for i in range(old_pp_degree): + ranks_for_pp_index = old_2d_map.get_data(pp_index=i, tp_index=None) + split_ranks = partition_data(ranks_for_pp_index, new_tp_degree) + for j in range(new_tp_degree): + new_2d_map.add_data(i, j, split_ranks[j]) + + return new_2d_map + + +def _reshape_pp_dimension(old_2d_map, new_pp_degree): + old_tp_degree = old_2d_map.tp_degree + new_2d_map = meg_2d_parallel_map(new_pp_degree, old_tp_degree) + for i in range(old_tp_degree): + ranks_for_tp_index = old_2d_map.get_data(pp_index=None, tp_index=i) + split_ranks = partition_data(ranks_for_tp_index, new_pp_degree) + for j in range(new_pp_degree): + new_2d_map.add_data(j, i, 
split_ranks[j]) + + return new_2d_map + + +def reshape_meg_2d_parallel(old_pp_degree, + old_tp_degree, + new_pp_degree, + new_tp_degree, + verbose=False): + assert new_pp_degree <= old_pp_degree + assert new_tp_degree <= old_tp_degree + + old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree) + old_2d_map.simple_init() + if verbose: + old_2d_map.print_data(f'original_2d_map:') + + if old_tp_degree != new_tp_degree: + new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree) + else: + new_tp_map = old_2d_map + if verbose: + new_tp_map.print_data(f'after_tp_reshape:') + + if old_pp_degree != new_pp_degree: + final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree) + else: + final_map = new_tp_map + + if verbose: + final_map.print_data(f'final_2d_map:') + + return final_map diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index d9a30f468..51229819f 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -27,7 +27,8 @@ mpu, print_rank_0, update_num_microbatches, - utils) + utils, + get_tokenizer) from megatron.enums import PositionEmbeddingType _CHECKPOINT_VERSION = None @@ -131,6 +132,7 @@ def save_checkpoint(iteration, model, optimizer, lr_scheduler): state_dict['checkpoint_version'] = 3.0 state_dict['iteration'] = iteration state_dict['tokens'] = args.consumed_train_tokens + state_dict['checkpoint_info'] = _checkpoint_info() # DeepSpeed saves the model/optimizer/scheduler if not args.deepspeed: @@ -468,3 +470,13 @@ def load_biencoder_checkpoint(model, only_query_model=False, print(' successfully loaded {}'.format(checkpoint_name)) return model + + +def _checkpoint_info(): + args = get_args() + tokenizer = get_tokenizer() + + return { + "padded_vocab_size": args.padded_vocab_size, + "original_vocab_size": tokenizer.vocab_size, + } \ No newline at end of file diff --git a/tools/convert_checkpoint/deepspeed_checkpoint.py b/tools/convert_checkpoint/deepspeed_checkpoint.py deleted file mode 100644 index c38e0d550..000000000 --- a/tools/convert_checkpoint/deepspeed_checkpoint.py +++ /dev/null @@ -1,189 +0,0 @@ -import os -from typing import Dict -import torch - -ZERO_FILE_PREFIX = 'zero_pp_rank_' -LAYER_FILE_PREFIX = 'layer_' -MP_RANK_FILE_PREFIX = 'mp_rank_' -EMBEDDING_LAYER_INDEX = 0 -FINAL_LAYER_NORM_INDEX = -1 -ARGS_KEY = 'args' -ITERATION_KEY = 'iteration' -SEQUENTIAL_LAYERS = [ - 'input_layernorm.weight', 'input_layernorm.bias', - 'self_attention.dense.bias', - 'post_attention_layernorm.weight', 'post_attention_layernorm.bias', - 'mlp.dense_4h_to_h.bias', - 'position_embeddings.weight' -] - -LAYER_CONCAT_DIM = { - 'self_attention.dense.weight': 1, - 'mlp.dense_4h_to_h.weight': 1 -} - -class DeepSpeedCheckpoint(object): - def __init__(self, dir, tp_degree=None, pp_degree=None): - self.dir = dir - self.file_list = self._get_files(dir) - self.zero_files = self._get_files_with_prefix(self.file_list, ZERO_FILE_PREFIX) - self.layer_files = self._get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX) - self.mp_rank_files = self._get_files_with_prefix(self.file_list, MP_RANK_FILE_PREFIX) - self.layer_keys = self._get_layer_keys() - self.layer_count = len(self.layer_keys) - self.original_tp_degree = len(self._get_files_with_prefix(self.layer_files, f'{LAYER_FILE_PREFIX}01')) - self.original_pp_degree = len(self.mp_rank_files) // self.original_tp_degree - self.dp_degree = len(self.zero_files) // (self.original_pp_degree * self.original_tp_degree) - self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree - self.pp_degree = 
self.original_pp_degree if pp_degree is None else pp_degree - self.global_state = {} - - self._sanity_check() - self.pp_to_transformer_map = self._build_pp_transformer_map() - self.transformer_file_map = self._build_transformer_file_map() - self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) - self._build_global_state() - - - - def show_tp_embedding_map(self): - self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') - - def show_tp_final_norm_map(self): - self._dump_mapping(self.tp_to_final_norm_map, 'tp_to_final_norm_layers') - - def show_pp_tranformer_map(self): - self._dump_mapping(self.pp_to_transformer_map, 'pp_to_tranformer_layers') - - def show_transformer_file_map(self): - self._dump_mapping(self.transformer_file_map, 'rank_to_tranformer_files') - - def _build_global_state(self): - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) - self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) - - def get_iteration(self): - if not ITERATION_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) - - return self.global_state[ITERATION_KEY] - - def get_embedding_state(self, tp_index: int) -> Dict: - assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] - sd = self._merge_state_dicts(sd_list) - return sd - - def get_args(self): - if not ARGS_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) - self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) - - return self.global_state[ARGS_KEY] - - - def get_transformer_state(self, tp_index: int, pp_index: int) -> list: - assert tp_index < self.tp_degree - assert pp_index < self.pp_degree - t_list = [] - for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] - sd = self._merge_state_dicts(sd_list) - t_list.append(sd) - return t_list - - def get_final_norm_state(self, tp_index:int) -> Dict: - assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) - return sd - - def _build_tp_other_layer_map(self, layer_index:int): - assert layer_index < len(self.layer_files) - layer_files = self._get_files_with_prefix(self.layer_files, self.layer_keys[layer_index]) - layer_file_partitions = self._partition_data(layer_files, self.tp_degree) - data_map = {i:flist for i, flist in enumerate(layer_file_partitions)} - return data_map - - def _build_pp_transformer_map(self): - data_map = {} - transformer_layers = self.layer_keys[1:-1] - layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = {i:transformer_layers[i*layers_per_pp:(i+1)*layers_per_pp] for i in range(0, self.pp_degree)} - return data_map - - def _dump_mapping(self, data_map, map_tag = None): - if map_tag is not None: - print(f'Dump mapping: {map_tag}') - for k, v in data_map.items(): - print(f'{k} = {v}') - - def _build_transformer_file_map(self): - transformer_layer_keys = self.layer_keys[1:-1] - file_map = {} - layers_per_pp = len(transformer_layer_keys) // self.pp_degree - for key_index, layer_key in enumerate(transformer_layer_keys): 
- pp_index = key_index // layers_per_pp - layer_files = self._get_files_with_prefix(self.layer_files, layer_key) - layer_file_partitions = self._partition_data(layer_files, self.tp_degree) - for tp_index in range(self.tp_degree): - map_key = (tp_index, pp_index) - if not map_key in file_map.keys(): - file_map[map_key] = [] - file_map[map_key].append(layer_file_partitions[tp_index]) - - return file_map - - def _sanity_check(self): - assert len(self.mp_rank_files) % self.tp_degree == 0 - assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0 - assert len(self.layer_keys) > 2 - assert (len(self.layer_keys) - 2) % self.pp_degree == 0 - - def _get_files_with_prefix(self, all_files, prefix): - file_list = [] - for file_path in all_files: - _, fname = os.path.split(file_path) - if fname.startswith(prefix): - file_list.append(file_path) - - return sorted(file_list) - - def validate_files(self): - for file in self.file_list: - if not os.path.isfile(file): - print(f'Error: {file} is not existent') - - def _get_files(self, dir): - file_list = [] - for root, dirs, files in os.walk(dir): - for file in files: - file_list.append(os.path.join(root, file)) - return file_list - - def _get_layer_keys(self): - key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 - for file_path in self.layer_files: - _, fname = os.path.split(file_path) - key_set.add(fname[:key_len]) - return sorted(list(key_set)) - - def _partition_data(self, data_list, num_partitions): - num_elems = len(data_list) - assert num_elems % num_partitions == 0 - partition_size = num_elems // num_partitions - partitions_list = [data_list[i:i+partition_size] for i in range(0, num_elems, partition_size)] - return partitions_list - - def _merge_state_dicts(self, sd_list): - merged_sd = {} - for key in sd_list[0].keys(): - if not key in SEQUENTIAL_LAYERS: - cat_dim = LAYER_CONCAT_DIM.get(key, 0) - merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) - else: - merged_sd[key] = sd_list[0][key] - return merged_sd diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed.py b/tools/convert_checkpoint/deepspeed_to_deepspeed.py new file mode 100644 index 000000000..f281f0365 --- /dev/null +++ b/tools/convert_checkpoint/deepspeed_to_deepspeed.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +import sys +import argparse +import os +import torch +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + +from megatron.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint, MP_RANK_FILE_PREFIX, PADDED_VOCAB_SIZE, CHECKPOINT_INFO_KEY + +CHECKPOINT_FILE_SUFFIX = '_model_states.pt' +MP_WORLD_SIZE ='mp_world_size' + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=None, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=None, + type=int, + help='Target PP degree') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def _create_layer_checkpoint_path(base_folder, tp_rank, layer_id): + ckpt_file = f'{layer_id}-model_{tp_rank:02d}{CHECKPOINT_FILE_SUFFIX}' + ckpt_path = os.path.join(base_folder, ckpt_file) + return ckpt_path + + +def 
_create_2d_checkpoint_path(base_folder, file_index): + ckpt_file = f'{MP_RANK_FILE_PREFIX}{file_index:02d}{CHECKPOINT_FILE_SUFFIX}' + ckpt_path = os.path.join(base_folder, ckpt_file) + return ckpt_path + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + +def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, + pp_index): + sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index) + layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index) + assert len(sd_list) == len(layer_id_list) + for sd, layer_id in zip(sd_list, layer_id_list): + ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, + layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index): + sd = ds_checkpoint.get_embedding_state(tp_index) + layer_id = ds_checkpoint.get_embedding_layer_id() + ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index): + sd = ds_checkpoint.get_final_norm_state(tp_index) + layer_id = ds_checkpoint.get_final_norm_layer_id() + ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, + pp_index): + sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index, + pp_index=pp_index) + sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree + file_id = pp_index * ds_checkpoint.tp_degree + tp_index + ckpt_path = _create_2d_checkpoint_path(base_folder, file_id) + + # Adjust specific fields + sd[ARGS_KEY] = ds_checkpoint.get_args() + sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree + sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree + sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size + + _save_checkpoint(ckpt_path, sd) + + +def _create_latest_file(base_folder, file_name, latest_tag): + file_path = os.path.join(base_folder, file_name) + os.makedirs(base_folder, exist_ok=True) + with open(file_path, 'w') as f: + f.write(str(latest_tag)) + + +def main(): + print(f'Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint') + + args = parse_arguments() + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}' + ) + + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) + iteration = ds_checkpoint.get_iteration() + latest_tag = f'global_step{iteration}' + _create_latest_file(args.output_folder, + 'latest_checkpointed_iteration.txt', iteration) + _create_latest_file(args.output_folder, 'latest', latest_tag) + base_folder = os.path.join(args.output_folder, latest_tag) + + for i in range(ds_checkpoint.tp_degree): + _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i) + _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i) + for j in range(ds_checkpoint.pp_degree): + _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, + j) + _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j) + + +if __name__ == "__main__": + main() diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 022759372..bb8a94d32 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ 
b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,16 @@ import os import torch from collections import OrderedDict -from deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +import sys +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + + +from megatron.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' @@ -13,18 +22,34 @@ ENCODER_KEY = 'encoder' WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head' WORD_EMBEDDINGS_KEY = 'word_embeddings' -FINAL_LAYER_NORM_KEY ='final_layernorm' +FINAL_LAYER_NORM_KEY = 'final_layernorm' CHECKPOINT_VERSION_KEY = 'checkpoint_version' CHECKPOINT_VERSION_VALUE = 3.0 ITERATION_KEY = 'iteration' + def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--input_folder', default=None, type=str, help='Input DeepSpeed Checkpoint folder') - parser.add_argument('--output_folder', default=None, type=str, help='Output Megatron checkpoint folder') - parser.add_argument('--target_tp', default=1, type=int, help='Target TP degree') - parser.add_argument('--target_pp', default=1, type=int, help='Target PP degree') - parser.add_argument('--for_release', action='store_true', help='Convert for release purpose, reset some (progress) counters.') + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=1, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=1, + type=int, + help='Target PP degree') + parser.add_argument( + '--for_release', + action='store_true', + help='Convert for release purpose, reset some (progress) counters.') args = parser.parse_args() print(f'args = {args}') return args @@ -39,6 +64,7 @@ def _convert_ds_transformer_state(sd_list): return new_sd + def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): path_list = [] iter_folder = f'iter_{iteration:07d}' @@ -47,18 +73,18 @@ def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): for j in range(0, pp_degree): rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}' ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt') - path_list[i].append(os.path.join(base_folder, iter_folder, ckpt_path)) + path_list[i].append( + os.path.join(base_folder, iter_folder, ckpt_path)) return path_list def _create_megatron_dict(): - language_model_dict = { - EMBEDDING_KEY: {}, - ENCODER_KEY: {} - } + language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}} megatron_dict = { - MODEL_KEY: {LANGUGAGE_MODEL_KEY: language_model_dict}, + MODEL_KEY: { + LANGUGAGE_MODEL_KEY: language_model_dict + }, CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE } return megatron_dict @@ -78,7 +104,11 @@ def _renest_sd(sd): return new_sd -def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, for_release=False): +def _create_rank_checkpoint(ds_checkpoint, + checkpoint_path, + tp_index, + pp_index, + for_release=False): meg_encoder_sd = OrderedDict() meg_embedding_sd = OrderedDict() meg_embedding_for_head_sd = OrderedDict() @@ -92,7 +122,7 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, if pp_index == 0: 
meg_embedding_sd.update(nested_embedding_sd) - if pp_index == ds_checkpoint.pp_degree -1: + if pp_index == ds_checkpoint.pp_degree - 1: for key, value in embedding_sd.items(): if key.startswith(WORD_EMBEDDINGS_KEY): fields = key.split('.') @@ -101,7 +131,10 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, meg_embedding_for_head_sd[new_key] = value final_norm_sd = ds_checkpoint.get_final_norm_state(tp_index) - new_final_norm_sd = {f'{FINAL_LAYER_NORM_KEY}.{key}': value for key, value in final_norm_sd.items()} + new_final_norm_sd = { + f'{FINAL_LAYER_NORM_KEY}.{key}': value + for key, value in final_norm_sd.items() + } meg_encoder_sd.update(new_final_norm_sd) checkpoint_sd = _create_megatron_dict() @@ -109,15 +142,19 @@ def _create_rank_checkpoint(ds_checkpoint, checkpoint_path, tp_index, pp_index, iteration = ds_checkpoint.get_iteration() checkpoint_sd[ITERATION_KEY] = iteration if pp_index == 0: - checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][EMBEDDING_KEY] = meg_embedding_sd + checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ + EMBEDDING_KEY] = meg_embedding_sd checkpoint_sd[MODEL_KEY][LANGUGAGE_MODEL_KEY][ENCODER_KEY] = meg_encoder_sd - if pp_index == ds_checkpoint.pp_degree -1: - checkpoint_sd[MODEL_KEY][WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd + if pp_index == ds_checkpoint.pp_degree - 1: + checkpoint_sd[MODEL_KEY][ + WORD_EMBEDDINGS_FOR_HEAD_KEY] = meg_embedding_for_head_sd checkpoint_sd[ARGS_KEY] = ds_checkpoint.get_args() # Adjust specific fields - checkpoint_sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree - checkpoint_sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree + checkpoint_sd[ + ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree + checkpoint_sd[ + ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree if for_release: checkpoint_sd[ARGS_KEY].consumed_train_samples = 0 checkpoint_sd[ARGS_KEY].consumed_valid_samples = 0 @@ -131,20 +168,27 @@ def _create_latest_file(base_folder, iteration): with open(file_path, 'w') as f: f.write(str(iteration)) + def main(): print(f'Convert DeepSpeed Checkpoint to Megatron Checkpoint') args = parse_arguments() - print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}') + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to Megatron checkpoint in {args.output_folder}' + ) - ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) iteration = ds_checkpoint.get_iteration() _create_latest_file(args.output_folder, iteration) - checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) + checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree) for i in range(0, ds_checkpoint.tp_degree): for j in range(0, ds_checkpoint.pp_degree): sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release) _save_checkpoint(checkpoint_paths[i][j], sd) + if __name__ == "__main__": main() diff --git a/tools/convert_checkpoint/deepspeed_to_transformers.py b/tools/convert_checkpoint/deepspeed_to_transformers.py index 667695026..3f1f00d5e 100755 --- a/tools/convert_checkpoint/deepspeed_to_transformers.py +++ b/tools/convert_checkpoint/deepspeed_to_transformers.py @@ -3,31 +3,37 @@ import os import torch import json - -from 
deepspeed_checkpoint import DeepSpeedCheckpoint +import sys +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + +from megatron.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments # the import was tested to work with this version # https://github.com/huggingface/transformers/commit/0af901e83 if it diverges we may consider # copying that version here instead -from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import ( - convert_megatron_checkpoint, -) -from transformers import GPT2Config, AutoTokenizer +from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint import convert_megatron_checkpoint +from transformers import GPT2Config def main(): + # this first part comes mainly from deepspeed_to_megatron.main args = parse_arguments() print( - f"Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}" + f'Converting DeepSpeed checkpoint in {args.input_folder} to HF Transformers checkpoint in {args.output_folder}' ) - ds_checkpoint = DeepSpeedCheckpoint( - args.input_folder, args.target_tp, args.target_pp - ) - ds_args = ds_checkpoint.get_args() - input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, args.for_release) + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) + iteration = ds_checkpoint.get_iteration() + input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, + args.for_release) # the 2nd part comes from transformers.models.megatron_gpt2.convert_megatron_gpt2_checkpoint.main # Spell out all parameters in case the defaults change. @@ -59,13 +65,14 @@ def main(): # Convert. print("Converting to HF Checkpoint") - output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config) + output_state_dict = convert_megatron_checkpoint(args, input_state_dict, + config) basename = args.output_folder os.makedirs(basename, exist_ok=True) # Print the structure of converted state dict. - # if args.print_checkpoint_structure: + #if args.print_checkpoint_structure: # recursive_print(None, output_state_dict) # Store the config to file. 
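
The hunk above rebuilds a single-rank Megatron state dict before handing it to the HF converter. For reference, the same flow can be driven in-process; a minimal sketch using only APIs introduced in this patch series ('my_input_folder' is a hypothetical path, and the positional call matches how main() invokes _create_rank_checkpoint):

    # Sketch: reproduce the state dict that deepspeed_to_transformers feeds
    # into convert_megatron_checkpoint, without the HF config handling.
    from megatron.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint
    from deepspeed_to_megatron import _create_rank_checkpoint

    ds_checkpoint = DeepSpeedCheckpoint('my_input_folder',  # hypothetical folder
                                        tp_degree=1, pp_degree=1)
    input_state_dict = _create_rank_checkpoint(ds_checkpoint, 0, 0, False)
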
@@ -73,20 +80,6 @@ def main(): output_config = config.to_dict() output_config["architectures"] = ["GPT2LMHeadModel"] output_config["model_type"] = "gpt2" - - # Add tokenizer class info to config.json - # see https://github.com/huggingface/transformers/issues/13906) - tokenizer_type = ds_args.tokenizer_type - if tokenizer_type == "GPT2BPETokenizer": - tokenizer_model_name = "gpt2" - elif tokenizer_type == "PretrainedFromHF": - tokenizer_model_name = ds_args.tokenizer_name_or_path - else: - raise ValueError(f"Unrecognized tokenizer_type {tokenizer_type}") - tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_name) - tokenizer_class = type(tokenizer).__name__ - output_config["tokenizer_class"] = tokenizer_class - print(f'Saving config to "{output_config_file}"') with open(output_config_file, "w") as f: json.dump(output_config, f) @@ -96,9 +89,7 @@ def main(): print(f'Saving checkpoint to "{output_checkpoint_file}"') torch.save(output_state_dict, output_checkpoint_file) - # Save tokenizer based on args - print(f"Adding {tokenizer_class} tokenizer files") - tokenizer.save_pretrained(basename) + print("Now add tokenizer files and upload to the hub") if __name__ == "__main__": diff --git a/tools/convert_checkpoint/inspect_checkpoint.py b/tools/convert_checkpoint/inspect_checkpoint.py index 5ee955bb4..82d154111 100644 --- a/tools/convert_checkpoint/inspect_checkpoint.py +++ b/tools/convert_checkpoint/inspect_checkpoint.py @@ -1,13 +1,19 @@ -import torch import sys +import torch import os from collections import OrderedDict +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) def dump_data(datum, name_list=[]): if type(datum) in (dict, OrderedDict): for k, v in datum.items(): - dump_data(v, name_list+[str(k)]) + dump_data(v, name_list + [str(k)]) elif type(datum) in (list, tuple): for v in datum: dump_data(v, name_list) @@ -15,10 +21,11 @@ def dump_data(datum, name_list=[]): prefix = '.'.join(name_list) print(f'[tensor] {prefix} = {datum.shape}') else: - #pass + #pass prefix = '.'.join(name_list) print(f'[other] {prefix} = {datum}') + def main(): if len(sys.argv) < 2: print(f'Usage: {sys.argv[0]} ') diff --git a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py index 3125f7d9a..a3efa1c85 100644 --- a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py +++ b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py @@ -1,19 +1,40 @@ +import sys +from pathlib import Path + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + import argparse -from deepspeed_checkpoint import DeepSpeedCheckpoint + +from megatron.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from megatron.checkpoint.reshape_meg_2d import reshape_meg_2d_parallel + def list_files(file_list, tag): print(f'Listing files: {tag}') for i, file in enumerate(file_list): print(f'{i+1}: {file}') + def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--folder', default=None, type=str, help='DeepSpeed Checkpoint folder') - parser.add_argument('--target_tp', default=None, type=int, help='Target TP degree') - parser.add_argument('--target_pp', default=None, type=int, help='Target PP degree') + parser.add_argument('--folder', + default=None, + type=str, + 
help='DeepSpeed Checkpoint folder') + parser.add_argument('--target_tp', + default=None, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=None, + type=int, + help='Target PP degree') args = parser.parse_args() print(f'args = {args}') - return args + return args def show_input_files(ds_checkpoint): @@ -22,38 +43,52 @@ def show_input_files(ds_checkpoint): list_files(ds_checkpoint.layer_files, 'layer') list_files(ds_checkpoint.mp_rank_files, 'mp rank') + def show_simple_state(ds_checkpoint): print(f'layer keys = {ds_checkpoint.layer_keys}') print(f'layer count = {ds_checkpoint.layer_count}') - print(f'tp_degree_count = {ds_checkpoint.tp_degree}') - print(f'pp_degree_count = {ds_checkpoint.pp_degree}') + print( + f'tp_degree_count = {ds_checkpoint.original_tp_degree} ------> {ds_checkpoint.tp_degree}' + ) + print( + f'pp_degree_count = {ds_checkpoint.original_pp_degree} ------> {ds_checkpoint.pp_degree}' + ) print(f'dp_degree_count = {ds_checkpoint.dp_degree}') + ds_checkpoint.old_2d_map.print_data('old 2d map ==>') + ds_checkpoint.new_2d_map.print_data('new 2d map ==>') + def show_mappings(ds_checkpoint): ds_checkpoint.show_pp_tranformer_map() ds_checkpoint.show_transformer_file_map() ds_checkpoint.show_tp_embedding_map() ds_checkpoint.show_tp_final_norm_map() + ds_checkpoint.show_2d_mapping() + def show_state_summary(tag, sd): - summary = {k:v.shape for k,v in sd.items()} + summary = {k: v.shape for k, v in sd.items()} print(f'{tag} = {summary}') + def show_embedding_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): sd = ds_checkpoint.get_embedding_state(i) show_state_summary(f'embedding[{i}]', sd) + def show_final_norm_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): sd = ds_checkpoint.get_final_norm_state(i) show_state_summary(f'final_norm[{i}]', sd) + def show_transformer_states(ds_checkpoint): for i in range(0, ds_checkpoint.tp_degree): for j in range(0, ds_checkpoint.pp_degree): - state_list = ds_checkpoint.get_transformer_state(tp_index=i, pp_index=j) + state_list = ds_checkpoint.get_transformer_state(tp_index=i, + pp_index=j) print(f'tp_pp_rank[{i},{j}] = ') for k, sd in enumerate(state_list): show_state_summary(f' block[{k}]', sd) @@ -64,9 +99,11 @@ def main(): print(f'Inspecting DeepSpeed Checkpoint') args = parse_arguments() - ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint(args.folder, args.target_tp, + args.target_pp) ds_checkpoint.validate_files() - + + show_simple_state(ds_checkpoint) show_input_files(ds_checkpoint) show_simple_state(ds_checkpoint) show_mappings(ds_checkpoint) @@ -76,5 +113,6 @@ def main(): checkpoint_args = ds_checkpoint.get_args() print(f'checkpoint args = {checkpoint_args}') + if __name__ == "__main__": main() From 675f12ca12459d1554b4bfdf8a8871e8e4c7c083 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 24 Jan 2022 18:22:16 -0800 Subject: [PATCH 02/33] add checkpoint tests --- tests/test_checkpoints.py | 210 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) create mode 100644 tests/test_checkpoints.py diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py new file mode 100644 index 000000000..10ea28dfe --- /dev/null +++ b/tests/test_checkpoints.py @@ -0,0 +1,210 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import os +from pathlib import Path + +from megatron.testing_utils import ( + CaptureStdout, + TestCasePlus, + execute_subprocess_async, + get_gpu_count, + require_deepspeed, + require_torch_gpu, + require_torch_multi_gpu, + set_seed +) + +set_seed(42) + + +def get_launcher(num_gpus): + # 1. explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup + # - it won't be able to handle that + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + +@require_deepspeed +@require_torch_gpu +class MegDSTestCheckpoints(TestCasePlus): + """ """ + + def setUp(self): + super().setUp() + + # at times magatron fails to build kernels and doesn't remove the lock file, which makes + # subsequent runs hang - so make sure there is no lock when starting the testing + meg_lock_file_path = self.repo_root_dir_str + "/megatron/fused_kernels/build/lock" + if os.path.exists(meg_lock_file_path): + os.unlink(meg_lock_file_path) + + def get_config(self, output_dir, tp_size, pp_size, dp_size): + data_dir = f"{self.data_dir}/gpt2" + + num_gpus = pp_size * tp_size * dp_size + print(f"Using {num_gpus} GPUs") + + n_samples = 300 # about 56 iterations + + exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume + seq_len = 128 + + # common/shared configs + + ds_args = f""" + --deepspeed + --deepspeed_config {self.test_file_dir_str}/ds_config.json + --zero-stage 1 + --deepspeed-activation-checkpointing + """.split() + + args = f""" + --tensor-model-parallel-size {tp_size} + --pipeline-model-parallel-size {pp_size} + --distributed-backend nccl + + --log-interval 1 + --save-interval 20 + --eval-interval 10 + --eval-iters 5 + --checkpoint-activations + --partition-activations + --exit-interval {exit_interval} + + --merge-file {data_dir}/gpt2-tiny-merges.txt + --vocab-file {data_dir}/gpt2-tiny-vocab.json + --save {output_dir}/checkpoints + --load {output_dir}/checkpoints + --data-path {data_dir}/meg-gpt2-openwebtext_text_document + --tensorboard-dir {output_dir}/tensorboard + --tensorboard-queue-size 5 + --log-timers-to-tensorboard + --log-batch-size-to-tensorboard + --log-validation-ppl-to-tensorboard + + --num-layers 2 + --hidden-size 64 + --num-attention-heads 2 + --seq-length {seq_len} + --max-position-embeddings 1024 + --micro-batch-size 1 + --global-batch-size 16 + --rampup-batch-size 2 2 {n_samples} + --train-samples {n_samples} + + --optimizer adam + --adam-beta1 0.9 + --adam-beta2 0.95 + --adam-eps 1e-8 + --lr 1e-4 + --lr-warmup-samples 5 + --lr-decay-samples 6 + --clip-grad 1.0 + --weight-decay 1e-1 + --fp16 + + --log-level debug + --log-level-replica info + """.split() + + # XXX: fails to handle: + #--embed-layernorm + # +# stderr: RuntimeError: Error(s) in loading state_dict for VocabParallelEmbedding: +# stderr: size mismatch for norm.weight: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]). +# stderr: size mismatch for norm.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([64]). 
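
A plausible reading of that failure, given _merge_state_dicts() from patch 01: any key not listed in SEQUENTIAL_LAYERS is concatenated across TP shards, so the replicated layernorm that --embed-layernorm adds inside the embedding layer would double in size when two TP ranks are merged. A tiny sketch of the suspected effect:

    import torch

    # Two TP ranks each hold an identical (replicated) 64-element norm.weight.
    norm_weight_shards = [torch.ones(64), torch.ones(64)]
    # 'norm.weight' is not in SEQUENTIAL_LAYERS, so the merge concatenates it,
    # yielding shape [128] -- the size mismatch reported above.
    merged = torch.cat(norm_weight_shards, dim=0)
    print(merged.shape)  # torch.Size([128])
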
+ + return args, ds_args, num_gpus + + + def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] + + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + # 1. test training from scratch (no checkpoint) + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test deepspeed is running + self.assertIn("DeepSpeed info", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test there should be no checkpoint this round + self.assertIn(f"Unable to find latest file at {output_dir}/checkpoints/latest", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + + def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] + + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from {output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) + + + def reshape_checkpoint(self, input_dir, output_dir, target_tp_size, target_pp_size): + cmd = f""" + python tools/convert_checkpoint/deepspeed_to_deepspeed.py + --input_folder {input_dir}/checkpoints/global_step20 + --output_folder {output_dir}/checkpoints + --target_tp {target_tp_size} --target_pp {target_pp_size} + """.split() + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + self.assertIn("Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint", cs.out) + + + + @require_torch_multi_gpu + def test_checkpoint_reshaping(self): + # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus + + output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + + # 1. train with TP=2 / PP=1 + self.train_checkpoint(output_dir1, tp_size=2, pp_size=1, dp_size=1) + + # 2. convert checkpoint to TP=1 / PP=1 + self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + + # 3. 
check we can resume training from a reshaped checkpoint with TP=1 / PP=1 + self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) From e379065b6e0a48c1246c7eec5e6ce3ba280eaa5b Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 25 Jan 2022 18:07:45 +0000 Subject: [PATCH 03/33] Validate input folder --- megatron/checkpoint/deepspeed_checkpoint.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/megatron/checkpoint/deepspeed_checkpoint.py b/megatron/checkpoint/deepspeed_checkpoint.py index d2a38aae0..33751fe52 100644 --- a/megatron/checkpoint/deepspeed_checkpoint.py +++ b/megatron/checkpoint/deepspeed_checkpoint.py @@ -32,6 +32,8 @@ class DeepSpeedCheckpoint(object): def __init__(self, dir, tp_degree=None, pp_degree=None): self.dir = dir + self._validate_folder(dir) + self.file_list = get_files(dir) self.zero_files = self._get_files_with_prefix(self.file_list, ZERO_FILE_PREFIX) @@ -39,6 +41,7 @@ def __init__(self, dir, tp_degree=None, pp_degree=None): LAYER_FILE_PREFIX) self.mp_rank_files = self._get_files_with_prefix( self.file_list, MP_RANK_FILE_PREFIX) + self.layer_keys = self._get_layer_keys() self.layer_count = len(self.layer_keys) self.original_tp_degree = len( @@ -276,3 +279,14 @@ def _strip_vocab_padding(self, padded_vocab_tensor): unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size) return unpadded_vocab_tensor.clone() + + def _validate_folder(self, dir): + assert os.path.exists(dir), f'{dir} path does not exist' + assert os.path.isdir(dir), f'{dir} is not a folder' + + file_list = get_files(dir) + + for file_prefix in [MP_RANK_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']: + ckpt_files = self._get_files_with_prefix(file_list, file_prefix) + assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' + From a1068e4d45d5a5133415c910e616890df95907e7 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 25 Jan 2022 23:04:45 +0000 Subject: [PATCH 04/33] Tests for tp/pp reshape --- megatron/checkpoint/deepspeed_checkpoint.py | 2 + tests/test_checkpoints.py | 48 ++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/megatron/checkpoint/deepspeed_checkpoint.py b/megatron/checkpoint/deepspeed_checkpoint.py index 33751fe52..0067b7dd8 100644 --- a/megatron/checkpoint/deepspeed_checkpoint.py +++ b/megatron/checkpoint/deepspeed_checkpoint.py @@ -243,11 +243,13 @@ def _get_files_with_prefix(self, all_files, prefix): return sorted(file_list) + def validate_files(self): for file in self.file_list: if not os.path.isfile(file): print(f'Error: {file} is not existent') + def _get_layer_keys(self): key_set = set() key_len = len(LAYER_FILE_PREFIX) + 2 diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 10ea28dfe..107470527 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -194,7 +194,7 @@ def reshape_checkpoint(self, input_dir, output_dir, target_tp_size, target_pp_si @require_torch_multi_gpu - def test_checkpoint_reshaping(self): + def test_checkpoint_reshaping_tp2_pp1_dp1(self): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) @@ -208,3 +208,49 @@ def test_checkpoint_reshaping(self): # 3. 
check we can resume training from a reshaped checkpoint with TP=1 / PP=1 self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) + + + @require_torch_multi_gpu + def test_checkpoint_reshaping_tp2_pp2_dp1(self): + # this test requires at least 4 gpus - will use only 2 gpus for now - XXX: extend to more gpus + + output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + + # 1. train with TP=2 / PP=2 + self.train_checkpoint(output_dir1, tp_size=2, pp_size=2, dp_size=1) + + # 2. convert checkpoint to TP=1 / PP=1 + self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + + # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 + self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) + + + @require_torch_multi_gpu + def test_checkpoint_reshaping_tp1_pp2_dp1(self): + # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus + + output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + + # 1. train with TP=1 / PP=2 + self.train_checkpoint(output_dir1, tp_size=1, pp_size=2, dp_size=1) + + # 2. convert checkpoint to TP=1 / PP=1 + with self.assertRaises(AssertionError) as context: + self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + + # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 + self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) + + + @require_torch_multi_gpu + def test_checkpoint_reshaping_empty_dir(self): + # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus + + output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + with self.assertRaises(AssertionError) as context: + self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + From 115bd313868e11c12e25c200df222e2928ff6154 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 25 Jan 2022 15:36:00 -0800 Subject: [PATCH 05/33] remove debug folders --- tests/test_checkpoints.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 107470527..f487fc87f 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -197,8 +197,8 @@ def reshape_checkpoint(self, input_dir, output_dir, target_tp_size, target_pp_si def test_checkpoint_reshaping_tp2_pp1_dp1(self): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) # 1. 
train with TP=2 / PP=1 self.train_checkpoint(output_dir1, tp_size=2, pp_size=1, dp_size=1) @@ -214,8 +214,8 @@ def test_checkpoint_reshaping_tp2_pp1_dp1(self): def test_checkpoint_reshaping_tp2_pp2_dp1(self): # this test requires at least 4 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) # 1. train with TP=2 / PP=2 self.train_checkpoint(output_dir1, tp_size=2, pp_size=2, dp_size=1) @@ -231,8 +231,8 @@ def test_checkpoint_reshaping_tp2_pp2_dp1(self): def test_checkpoint_reshaping_tp1_pp2_dp1(self): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) # 1. train with TP=1 / PP=2 self.train_checkpoint(output_dir1, tp_size=1, pp_size=2, dp_size=1) @@ -249,8 +249,7 @@ def test_checkpoint_reshaping_tp1_pp2_dp1(self): def test_checkpoint_reshaping_empty_dir(self): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) with self.assertRaises(AssertionError) as context: self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) - From cc2fad1fb78c556eae47eaba1de33db6eed118f0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 25 Jan 2022 15:44:51 -0800 Subject: [PATCH 06/33] fix test_checkpoint_reshaping_empty_dir --- tests/test_checkpoints.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index f487fc87f..86fb4e006 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -251,5 +251,5 @@ def test_checkpoint_reshaping_empty_dir(self): output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) - with self.assertRaises(AssertionError) as context: - self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + with self.assertRaises(RuntimeError) as context: + self.reshape_checkpoint(input_dir=output_dir1+"/xyz", output_dir=output_dir2, target_tp_size=1, target_pp_size=1) From b6733d5791b37a95e77e818282b3ba25cfd036df Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 26 Jan 2022 19:15:20 +0000 Subject: [PATCH 07/33] Fix unit tests --- tests/test_checkpoints.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 86fb4e006..d3be67234 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -238,8 +238,7 @@ def test_checkpoint_reshaping_tp1_pp2_dp1(self): self.train_checkpoint(output_dir1, tp_size=1, pp_size=2, dp_size=1) # 2. 
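# Why RuntimeError rather than AssertionError in the empty-dir test fixed
# here (an inference from the helpers used, not stated in the patch):
# reshape_checkpoint() shells out via execute_subprocess_async(), so the
# converter's internal assert dies in the child process and what the test
# observes is the launcher's RuntimeError for a non-zero exit code.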
convert checkpoint to TP=1 / PP=1 - with self.assertRaises(AssertionError) as context: - self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) From 9bf7ac51f7b42119d0e403f421f9e0d6363c0a15 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 7 Feb 2022 19:09:20 +0000 Subject: [PATCH 08/33] Remove deepspeed checkpoint utils --- megatron/checkpoint/__init__.py | 0 megatron/checkpoint/conversion_utils.py | 26 -- megatron/checkpoint/deepspeed_checkpoint.py | 294 ------------------ megatron/checkpoint/reshape_meg_2d.py | 104 ------- .../deepspeed_to_deepspeed.py | 66 +++- 5 files changed, 60 insertions(+), 430 deletions(-) delete mode 100644 megatron/checkpoint/__init__.py delete mode 100644 megatron/checkpoint/conversion_utils.py delete mode 100644 megatron/checkpoint/deepspeed_checkpoint.py delete mode 100644 megatron/checkpoint/reshape_meg_2d.py diff --git a/megatron/checkpoint/__init__.py b/megatron/checkpoint/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/megatron/checkpoint/conversion_utils.py b/megatron/checkpoint/conversion_utils.py deleted file mode 100644 index 6bf2f2f96..000000000 --- a/megatron/checkpoint/conversion_utils.py +++ /dev/null @@ -1,26 +0,0 @@ -import os - - -def validate_files(file_list): - for file in file_list: - if not os.path.isfile(file): - print(f'Error: {file} is not existent') - - -def get_files(dir): - file_list = [] - for root, dirs, files in os.walk(dir): - for file in files: - file_list.append(os.path.join(root, file)) - return file_list - - -def partition_data(data_list, num_partitions): - num_elems = len(data_list) - assert num_elems % num_partitions == 0 - partition_size = num_elems // num_partitions - partitions_list = [ - data_list[i:i + partition_size] - for i in range(0, num_elems, partition_size) - ] - return partitions_list diff --git a/megatron/checkpoint/deepspeed_checkpoint.py b/megatron/checkpoint/deepspeed_checkpoint.py deleted file mode 100644 index 0067b7dd8..000000000 --- a/megatron/checkpoint/deepspeed_checkpoint.py +++ /dev/null @@ -1,294 +0,0 @@ -import os -from typing import Dict -import torch -from .conversion_utils import partition_data, get_files -from .reshape_meg_2d import reshape_meg_2d_parallel, meg_2d_parallel_map -from megatron.tokenizer.tokenizer import _vocab_size_with_padding - -ZERO_FILE_PREFIX = 'zero_pp_rank_' -LAYER_FILE_PREFIX = 'layer_' -MP_RANK_FILE_PREFIX = 'mp_rank_' -EMBEDDING_LAYER_INDEX = 0 -FINAL_LAYER_NORM_INDEX = -1 -ARGS_KEY = 'args' -CHECKPOINT_INFO_KEY = 'checkpoint_info' -ITERATION_KEY = 'iteration' -SEQUENTIAL_LAYERS = [ - 'input_layernorm.weight', 'input_layernorm.bias', - 'self_attention.dense.bias', 'post_attention_layernorm.weight', - 'post_attention_layernorm.bias', 'mlp.dense_4h_to_h.bias', - 'position_embeddings.weight' -] - -LAYER_CONCAT_DIM = { - 'self_attention.dense.weight': 1, - 'mlp.dense_4h_to_h.weight': 1 -} - -WORD_EMBEDDINGS_KEY = 'word_embeddings.weight' -ORIGINAL_VOCAB_SIZE = 'original_vocab_size' -PADDED_VOCAB_SIZE = 'padded_vocab_size' - -class DeepSpeedCheckpoint(object): - def __init__(self, dir, tp_degree=None, pp_degree=None): - self.dir = dir - self._validate_folder(dir) - - self.file_list = get_files(dir) - self.zero_files = 
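# For reference, a minimal sketch of the partition_data() contract deleted
# here (the utility now comes from deepspeed.checkpoint): an ordered, even
# split into contiguous chunks.
def partition_data_sketch(data_list, num_partitions):
    assert len(data_list) % num_partitions == 0
    size = len(data_list) // num_partitions
    return [data_list[i:i + size] for i in range(0, len(data_list), size)]

# partition_data_sketch([0, 1, 2, 3], 2) -> [[0, 1], [2, 3]]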
self._get_files_with_prefix(self.file_list, - ZERO_FILE_PREFIX) - self.layer_files = self._get_files_with_prefix(self.file_list, - LAYER_FILE_PREFIX) - self.mp_rank_files = self._get_files_with_prefix( - self.file_list, MP_RANK_FILE_PREFIX) - - self.layer_keys = self._get_layer_keys() - self.layer_count = len(self.layer_keys) - self.original_tp_degree = len( - self._get_files_with_prefix(self.layer_files, - f'{LAYER_FILE_PREFIX}01')) - self.original_pp_degree = len( - self.mp_rank_files) // self.original_tp_degree - self.dp_degree = max( - 1, - len(self.zero_files) // - (self.original_pp_degree * self.original_tp_degree)) - self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree - self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree - self.old_2d_map = meg_2d_parallel_map(self.original_pp_degree, - self.original_tp_degree) - self.old_2d_map.simple_init() - self.new_2d_map = reshape_meg_2d_parallel( - old_pp_degree=self.original_pp_degree, - old_tp_degree=self.original_tp_degree, - new_pp_degree=self.pp_degree, - new_tp_degree=self.tp_degree) - self.global_state = {} - - self._sanity_check() - self.pp_to_transformer_map = self._build_pp_transformer_map() - self.transformer_file_map = self._build_transformer_file_map() - self.tp_to_embedding_map = self._build_tp_other_layer_map( - EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map( - FINAL_LAYER_NORM_INDEX) - self._build_global_state() - - def show_2d_mapping(self): - print(f'reshaped 2d map ---- begin') - - for i in range(self.pp_degree): - for j in range(self.tp_degree): - file_list = self.get_2d_parallel_files(pp_index=i, tp_index=j) - print(f'[{i}, {j}] = {file_list}') - - print(f'reshaped 2d map ---- end') - - def show_tp_embedding_map(self): - self._dump_mapping(self.tp_to_embedding_map, 'tp_to_embedding_layers') - - def show_tp_final_norm_map(self): - self._dump_mapping(self.tp_to_final_norm_map, - 'tp_to_final_norm_layers') - - def show_pp_tranformer_map(self): - self._dump_mapping(self.pp_to_transformer_map, - 'pp_to_tranformer_layers') - - def show_transformer_file_map(self): - self._dump_mapping(self.transformer_file_map, - 'rank_to_tranformer_files') - - def _build_global_state(self): - sd = torch.load(self.mp_rank_files[0], - map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) - self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) - - def get_embedding_layer_id(self): - return self.layer_keys[EMBEDDING_LAYER_INDEX] - - def get_final_norm_layer_id(self): - return self.layer_keys[FINAL_LAYER_NORM_INDEX] - - def get_iteration(self): - if not ITERATION_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], - map_location=torch.device('cpu')) - self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) - - return self.global_state[ITERATION_KEY] - - def get_embedding_state(self, tp_index: int) -> Dict: - assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [ - torch.load(fname, map_location=torch.device('cpu')) - for fname in self.tp_to_embedding_map[tp_index] - ] - sd = self._merge_state_dicts(sd_list) - sd[WORD_EMBEDDINGS_KEY] = self._strip_vocab_padding(sd[WORD_EMBEDDINGS_KEY]) - return sd - - def _get_checkpoint_value(self, key): - if not key in self.global_state: - sd = torch.load(self.mp_rank_files[0], - map_location=torch.device('cpu')) - self.global_state[key] = sd.get(key, None) - - return self.global_state[key] - - def get_args(self): - return self._get_checkpoint_value(ARGS_KEY) - 
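# Aside: every torch.load() in this class pins tensors to host memory,
#
#   sd = torch.load(path, map_location=torch.device('cpu'))
#
# so checkpoints larger than any single GPU can still be inspected and
# reshaped offline.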
- - def get_checkpoint_info(self): - return self._get_checkpoint_value(CHECKPOINT_INFO_KEY) - - def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict: - assert tp_index < self.tp_degree - assert pp_index < self.pp_degree - fname_list = self.get_2d_parallel_files(tp_index=tp_index, - pp_index=pp_index) - sd_list = [ - torch.load(fname, map_location=torch.device('cpu')) - for fname in fname_list - ] - # HACK HACK HACK, should be merging i.e., sd = self._merge_state_dicts(sd_list) - sd = sd_list[0] - return sd - - def get_transformer_state(self, tp_index: int, pp_index: int) -> list: - assert tp_index < self.tp_degree - assert pp_index < self.pp_degree - t_list = [] - for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [ - torch.load(fname, map_location=torch.device('cpu')) - for fname in fname_list - ] - sd = self._merge_state_dicts(sd_list) - t_list.append(sd) - return t_list - - def get_pp_transformer_map(self, pp_index: int) -> list: - assert pp_index < self.pp_degree - return self.pp_to_transformer_map[pp_index] - - def get_final_norm_state(self, tp_index: int) -> Dict: - assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], - map_location=torch.device('cpu')) - return sd - - def _build_tp_other_layer_map(self, layer_index: int): - assert layer_index < len(self.layer_files) - layer_files = self._get_files_with_prefix(self.layer_files, - self.layer_keys[layer_index]) - layer_file_partitions = partition_data(layer_files, self.tp_degree) - data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} - return data_map - - def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list: - assert tp_index < self.tp_degree - assert pp_index < self.pp_degree - file_indices = self.new_2d_map.get_data(pp_index=pp_index, - tp_index=tp_index) - return [self.mp_rank_files[i] for i in file_indices] - - def _build_pp_transformer_map(self): - data_map = {} - transformer_layers = self.layer_keys[1:-1] - layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = { - i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] - for i in range(0, self.pp_degree) - } - return data_map - - def _dump_mapping(self, data_map, map_tag=None): - if map_tag is not None: - print(f'Dump mapping: {map_tag}') - for k, v in data_map.items(): - print(f'{k} = {v}') - - def _build_transformer_file_map(self): - transformer_layer_keys = self.layer_keys[1:-1] - file_map = {} - layers_per_pp = len(transformer_layer_keys) // self.pp_degree - for key_index, layer_key in enumerate(transformer_layer_keys): - pp_index = key_index // layers_per_pp - layer_files = self._get_files_with_prefix(self.layer_files, - layer_key) - layer_file_partitions = partition_data(layer_files, self.tp_degree) - for tp_index in range(self.tp_degree): - map_key = (tp_index, pp_index) - if not map_key in file_map.keys(): - file_map[map_key] = [] - file_map[map_key].append(layer_file_partitions[tp_index]) - - return file_map - - def _sanity_check(self): - assert len(self.mp_rank_files) % self.tp_degree == 0 - assert len(self.zero_files) % (self.pp_degree * self.tp_degree) == 0 - assert len(self.layer_keys) > 2 - assert (len(self.layer_keys) - 2) % self.pp_degree == 0 - - def _get_files_with_prefix(self, all_files, prefix): - file_list = [] - for file_path in all_files: - _, fname = os.path.split(file_path) - if fname.startswith(prefix): - file_list.append(file_path) - - return sorted(file_list) - - - def validate_files(self): - 
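# Worked example (sketch) of _build_pp_transformer_map here: the [1:-1]
# slice drops the embedding and final-norm keys, and each pipeline stage
# then owns a contiguous block of what remains, e.g. with 4 transformer
# layer keys and pp_degree=2:
#
#   layer_keys = [emb, t1, t2, t3, t4, final_norm]
#   -> {0: [t1, t2], 1: [t3, t4]}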
for file in self.file_list: - if not os.path.isfile(file): - print(f'Error: {file} is not existent') - - - def _get_layer_keys(self): - key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 - for file_path in self.layer_files: - _, fname = os.path.split(file_path) - key_set.add(fname[:key_len]) - return sorted(list(key_set)) - - def _merge_state_dicts(self, sd_list): - merged_sd = {} - for key in sd_list[0].keys(): - if not key in SEQUENTIAL_LAYERS: - cat_dim = LAYER_CONCAT_DIM.get(key, 0) - merged_sd[key] = torch.cat([sd[key] for sd in sd_list], - dim=cat_dim) - else: - merged_sd[key] = sd_list[0][key] - - return merged_sd - - - def _strip_vocab_padding(self, padded_vocab_tensor): - target_args = self.get_args() - checkpoint_info = self.get_checkpoint_info() - target_args.tensor_model_parallel_size = self.tp_degree - target_args.padded_vocab_size = _vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args) - assert target_args.padded_vocab_size <= padded_vocab_tensor.numel() - checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size - unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size) - return unpadded_vocab_tensor.clone() - - - def _validate_folder(self, dir): - assert os.path.exists(dir), f'{dir} path does not exist' - assert os.path.isdir(dir), f'{dir} is not a folder' - - file_list = get_files(dir) - - for file_prefix in [MP_RANK_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']: - ckpt_files = self._get_files_with_prefix(file_list, file_prefix) - assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' - diff --git a/megatron/checkpoint/reshape_meg_2d.py b/megatron/checkpoint/reshape_meg_2d.py deleted file mode 100644 index 46bc34c27..000000000 --- a/megatron/checkpoint/reshape_meg_2d.py +++ /dev/null @@ -1,104 +0,0 @@ -from .conversion_utils import partition_data - - -class meg_2d_parallel_map(object): - def __init__(self, pp_degree, tp_degree): - self.pp_degree = pp_degree - self.tp_degree = tp_degree - self.map = {} - - def simple_init(self): - self.map = { - self._make_key(i // self.tp_degree, i % self.tp_degree): [i] - for i in range(self.pp_degree * self.tp_degree) - } - - def add_data(self, pp_index, tp_index, data): - self._validate_indices(pp_index, tp_index) - assert type(data) is list - - key = self._make_key(pp_index, tp_index) - if not key in self.map.keys(): - self.map[key] = [] - self.map[key] += data - - def get_data(self, pp_index=None, tp_index=None): - self._validate_indices(pp_index, tp_index) - pp_indices = list(range( - self.pp_degree)) if pp_index is None else [pp_index] - tp_indices = list(range( - self.tp_degree)) if tp_index is None else [tp_index] - - result = [] - for i in pp_indices: - for j in tp_indices: - result += self.map[self._make_key(i, j)] - - return result - - def print_data(self, tag): - print(f'{tag}') - for key, value in self.map.items(): - print(f'{key} = {value}') - - def _validate_indices(self, pp_index, tp_index): - assert pp_index is None or pp_index < self.pp_degree - assert tp_index is None or tp_index < self.tp_degree - - def _make_key(self, i, j): - return f'{i},{j}' - - -def _reshape_tp_dimension(old_2d_map, new_tp_degree): - old_pp_degree = old_2d_map.pp_degree - new_2d_map = meg_2d_parallel_map(old_pp_degree, new_tp_degree) - for i in range(old_pp_degree): - ranks_for_pp_index = old_2d_map.get_data(pp_index=i, tp_index=None) - split_ranks = partition_data(ranks_for_pp_index, 
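# Worked example (sketch) of the reshaping this deleted module performed:
# collapsing PP=2 x TP=2 down to PP=1 x TP=1 funnels all four original
# ranks into the single new coordinate, preserving rank order:
#
#   original_2d_map:  '0,0'=[0]    '0,1'=[1]    '1,0'=[2]    '1,1'=[3]
#   after_tp_reshape: '0,0'=[0, 1]              '1,0'=[2, 3]
#   final_2d_map:     '0,0'=[0, 1, 2, 3]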
new_tp_degree) - for j in range(new_tp_degree): - new_2d_map.add_data(i, j, split_ranks[j]) - - return new_2d_map - - -def _reshape_pp_dimension(old_2d_map, new_pp_degree): - old_tp_degree = old_2d_map.tp_degree - new_2d_map = meg_2d_parallel_map(new_pp_degree, old_tp_degree) - for i in range(old_tp_degree): - ranks_for_tp_index = old_2d_map.get_data(pp_index=None, tp_index=i) - split_ranks = partition_data(ranks_for_tp_index, new_pp_degree) - for j in range(new_pp_degree): - new_2d_map.add_data(j, i, split_ranks[j]) - - return new_2d_map - - -def reshape_meg_2d_parallel(old_pp_degree, - old_tp_degree, - new_pp_degree, - new_tp_degree, - verbose=False): - assert new_pp_degree <= old_pp_degree - assert new_tp_degree <= old_tp_degree - - old_2d_map = meg_2d_parallel_map(old_pp_degree, old_tp_degree) - old_2d_map.simple_init() - if verbose: - old_2d_map.print_data(f'original_2d_map:') - - if old_tp_degree != new_tp_degree: - new_tp_map = _reshape_tp_dimension(old_2d_map, new_tp_degree) - else: - new_tp_map = old_2d_map - if verbose: - new_tp_map.print_data(f'after_tp_reshape:') - - if old_pp_degree != new_pp_degree: - final_map = _reshape_pp_dimension(new_tp_map, new_pp_degree) - else: - final_map = new_tp_map - - if verbose: - final_map.print_data(f'final_2d_map:') - - return final_map diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed.py b/tools/convert_checkpoint/deepspeed_to_deepspeed.py index f281f0365..6d0cbbb22 100644 --- a/tools/convert_checkpoint/deepspeed_to_deepspeed.py +++ b/tools/convert_checkpoint/deepspeed_to_deepspeed.py @@ -3,6 +3,7 @@ import argparse import os import torch + from pathlib import Path # insert megatron's root dir into sys.path @@ -10,10 +11,24 @@ if root_repo_path not in sys.path: sys.path.insert(0, root_repo_path) -from megatron.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint, MP_RANK_FILE_PREFIX, PADDED_VOCAB_SIZE, CHECKPOINT_INFO_KEY +from megatron.tokenizer.tokenizer import _vocab_size_with_padding +from deepspeed.checkpoint.constants import ( + ARGS_KEY, + CHECKPOINT_INFO_KEY, +) + +from deepspeed.checkpoint import ( + DeepSpeedCheckpoint, + get_model_ckpt_name_for_rank, + get_zero_ckpt_name_for_rank, + get_layer_ckpt_name_for_rank +) CHECKPOINT_FILE_SUFFIX = '_model_states.pt' MP_WORLD_SIZE ='mp_world_size' +WORD_EMBEDDINGS_KEY = 'word_embeddings.weight' +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +PADDED_VOCAB_SIZE = 'padded_vocab_size' def parse_arguments(): parser = argparse.ArgumentParser() @@ -33,6 +48,10 @@ def parse_arguments(): default=None, type=int, help='Target PP degree') + parser.add_argument('--target_dp', + default=None, + type=int, + help='Target DP degree') args = parser.parse_args() print(f'args = {args}') return args @@ -62,13 +81,26 @@ def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index) assert len(sd_list) == len(layer_id_list) for sd, layer_id in zip(sd_list, layer_id_list): - ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, - layer_id) +# ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, +# layer_id) + ckpt_path = get_layer_ckpt_name_for_rank(base_folder, layer_id, tp_index) _save_checkpoint(ckpt_path, sd) +def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): + target_args = ds_checkpoint.get_args() + checkpoint_info = ds_checkpoint.get_checkpoint_info() + target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree + target_args.padded_vocab_size = 
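# Sketch of the padding rule behind _vocab_size_with_padding() used here
# (the canonical version lives in megatron/tokenizer/tokenizer.py; the
# divisor below assumes the usual make-vocab-size-divisible-by * tp_size
# rule):
def padded_vocab_size_sketch(orig_vocab_size, divisible_by, tp_size):
    multiple = divisible_by * tp_size
    padded = orig_vocab_size
    while padded % multiple != 0:
        padded += 1
    return padded

# padded_vocab_size_sketch(50257, 128, 2) -> 50432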
_vocab_size_with_padding(checkpoint_info[ORIGINAL_VOCAB_SIZE], target_args) + assert target_args.padded_vocab_size <= padded_vocab_tensor.numel() + checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size + unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size) + return unpadded_vocab_tensor.clone() + + def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index): sd = ds_checkpoint.get_embedding_state(tp_index) + sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY]) layer_id = ds_checkpoint.get_embedding_layer_id() ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, layer_id) _save_checkpoint(ckpt_path, sd) @@ -87,7 +119,8 @@ def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index=pp_index) sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree file_id = pp_index * ds_checkpoint.tp_degree + tp_index - ckpt_path = _create_2d_checkpoint_path(base_folder, file_id) +# ckpt_path = _create_2d_checkpoint_path(base_folder, file_id) + ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}') # Adjust specific fields sd[ARGS_KEY] = ds_checkpoint.get_args() @@ -98,6 +131,18 @@ def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, _save_checkpoint(ckpt_path, sd) +def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index): + + _2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index + global_rank = (dp_index * ds_checkpoint.pp_degree * ds_checkpoint.tp_degree) + _2d_rank + sd = ds_checkpoint.get_zero_checkpoint_state(global_rank=global_rank) + + ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder, + dp_rank=dp_index, + mp_rank=_2d_rank) + _save_checkpoint(ckpt_path, sd) + + def _create_latest_file(base_folder, file_name, latest_tag): file_path = os.path.join(base_folder, file_name) os.makedirs(base_folder, exist_ok=True) @@ -113,8 +158,11 @@ def main(): f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}' ) - ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, - args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint( + args.input_folder, + args.target_tp, + args.target_pp, + args.target_dp) iteration = ds_checkpoint.get_iteration() latest_tag = f'global_step{iteration}' _create_latest_file(args.output_folder, @@ -130,6 +178,12 @@ def main(): j) _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j) + + for i in range(ds_checkpoint.dp_degree): + for j in range(ds_checkpoint.pp_degree): + for k in range(ds_checkpoint.tp_degree): + _create_zero_checkpoint(ds_checkpoint, base_folder, i, j, k) + if __name__ == "__main__": main() From 29ca2bcc92e75a51ca9555d662550d92fbad0ec2 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 23 Feb 2022 17:38:47 +0000 Subject: [PATCH 09/33] Use DS 3D reshaping utils --- .../deepspeed_to_deepspeed.py | 50 ++++++++----------- .../deepspeed_to_megatron.py | 2 +- .../deepspeed_to_transformers.py | 2 +- .../inspect_deepspeed_checkpoint.py | 3 +- 4 files changed, 25 insertions(+), 32 deletions(-) diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed.py b/tools/convert_checkpoint/deepspeed_to_deepspeed.py index 6d0cbbb22..f7605f56c 100644 --- a/tools/convert_checkpoint/deepspeed_to_deepspeed.py +++ b/tools/convert_checkpoint/deepspeed_to_deepspeed.py @@ -12,7 +12,7 @@ sys.path.insert(0, root_repo_path) from megatron.tokenizer.tokenizer import _vocab_size_with_padding -from 
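# Sketch of the rank arithmetic _create_zero_checkpoint above relies on:
# ZeRO files are laid out dp-major over the flattened (pp, tp) model-parallel
# rank:
def zero_global_rank(dp_index, pp_index, tp_index, pp_degree, tp_degree):
    mp_rank = pp_index * tp_degree + tp_index
    return dp_index * pp_degree * tp_degree + mp_rank

# e.g. pp_degree=tp_degree=2: (dp=1, pp=0, tp=1) -> rank 5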
deepspeed.checkpoint.constants import ( +from deepspeed.checkpoint.deepspeed_checkpoint import ( ARGS_KEY, CHECKPOINT_INFO_KEY, ) @@ -57,17 +57,6 @@ def parse_arguments(): return args -def _create_layer_checkpoint_path(base_folder, tp_rank, layer_id): - ckpt_file = f'{layer_id}-model_{tp_rank:02d}{CHECKPOINT_FILE_SUFFIX}' - ckpt_path = os.path.join(base_folder, ckpt_file) - return ckpt_path - - -def _create_2d_checkpoint_path(base_folder, file_index): - ckpt_file = f'{MP_RANK_FILE_PREFIX}{file_index:02d}{CHECKPOINT_FILE_SUFFIX}' - ckpt_path = os.path.join(base_folder, ckpt_file) - return ckpt_path - def _save_checkpoint(file_path, chkpt_sd): dir, _ = os.path.split(file_path) @@ -75,15 +64,15 @@ def _save_checkpoint(file_path, chkpt_sd): torch.save(chkpt_sd, file_path) -def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, - pp_index): +def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index): sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index) layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index) assert len(sd_list) == len(layer_id_list) for sd, layer_id in zip(sd_list, layer_id_list): -# ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, -# layer_id) - ckpt_path = get_layer_ckpt_name_for_rank(base_folder, layer_id, tp_index) + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + layer_id=layer_id, + tp_rank=tp_index) _save_checkpoint(ckpt_path, sd) @@ -100,16 +89,23 @@ def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index): sd = ds_checkpoint.get_embedding_state(tp_index) - sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY]) + if ds_checkpoint.is_change_tp_degree(): + sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY]) layer_id = ds_checkpoint.get_embedding_layer_id() - ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, layer_id) + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) _save_checkpoint(ckpt_path, sd) def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index): sd = ds_checkpoint.get_final_norm_state(tp_index) layer_id = ds_checkpoint.get_final_norm_layer_id() - ckpt_path = _create_layer_checkpoint_path(base_folder, tp_index, layer_id) + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) _save_checkpoint(ckpt_path, sd) @@ -119,7 +115,6 @@ def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index=pp_index) sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree file_id = pp_index * ds_checkpoint.tp_degree + tp_index -# ckpt_path = _create_2d_checkpoint_path(base_folder, file_id) ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}') # Adjust specific fields @@ -127,15 +122,15 @@ def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size - _save_checkpoint(ckpt_path, sd) def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_index): - _2d_rank = (pp_index * ds_checkpoint.tp_degree) + tp_index - global_rank = (dp_index * ds_checkpoint.pp_degree * ds_checkpoint.tp_degree) + _2d_rank - sd = 
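# Note on the change below: the hand-rolled global-rank arithmetic gives way
# to DeepSpeed's own 3D lookup, e.g. (sketch)
#
#   sd = ds_checkpoint.get_zero_checkpoint_state(
#       pp_index=j, tp_index=k, dp_index=i)
#
# so this script no longer needs to know how ranks are flattened on disk.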
ds_checkpoint.get_zero_checkpoint_state(global_rank=global_rank) + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index) ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder, dp_rank=dp_index, @@ -173,11 +168,10 @@ def main(): for i in range(ds_checkpoint.tp_degree): _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i) _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i) + for j in range(ds_checkpoint.pp_degree): - _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, - j) + _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j) _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j) - for i in range(ds_checkpoint.dp_degree): for j in range(ds_checkpoint.pp_degree): diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index bb8a94d32..368826e2b 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -13,7 +13,7 @@ sys.path.insert(0, root_repo_path) -from megatron.checkpoint.deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint import DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' diff --git a/tools/convert_checkpoint/deepspeed_to_transformers.py b/tools/convert_checkpoint/deepspeed_to_transformers.py index 3f1f00d5e..015f63a94 100755 --- a/tools/convert_checkpoint/deepspeed_to_transformers.py +++ b/tools/convert_checkpoint/deepspeed_to_transformers.py @@ -11,7 +11,7 @@ if root_repo_path not in sys.path: sys.path.insert(0, root_repo_path) -from megatron.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint import DeepSpeedCheckpoint from deepspeed_to_megatron import _create_rank_checkpoint, parse_arguments # the import was tested to work with this version diff --git a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py index a3efa1c85..09fa60991 100644 --- a/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py +++ b/tools/convert_checkpoint/inspect_deepspeed_checkpoint.py @@ -8,8 +8,7 @@ import argparse -from megatron.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint -from megatron.checkpoint.reshape_meg_2d import reshape_meg_2d_parallel +from deepspeed.checkpoint import DeepSpeedCheckpoint def list_files(file_list, tag): From 6d863582fb34214c41680df450ac0b57a65b91a2 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 18 May 2022 13:43:08 -0700 Subject: [PATCH 10/33] convert to bf16 --- tests/ds_config_bf16.json | 14 ++++++++++++++ tests/test_checkpoints.py | 9 ++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/ds_config_bf16.json diff --git a/tests/ds_config_bf16.json b/tests/ds_config_bf16.json new file mode 100644 index 000000000..6afd1f6b2 --- /dev/null +++ b/tests/ds_config_bf16.json @@ -0,0 +1,14 @@ +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 16, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 0 + }, + "bf16": { + "enabled": true + }, + "zero_allow_untested_optimizer": true, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index d3be67234..79e985365 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -64,8 +64,8 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): ds_args = f""" --deepspeed - 
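# Note (an inference from how this series uses it, not stated in the patch):
# ds_config_bf16.json pairs "bf16": {"enabled": true} with ZeRO stage 0
# because that path keeps fp32 master weights and optimizer state inside the
# bf16 optimizer itself, which is exactly the "hp" state the universal
# checkpoint work later in this series serializes.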
--deepspeed_config {self.test_file_dir_str}/ds_config.json - --zero-stage 1 + --deepspeed_config {self.test_file_dir_str}/ds_config_bf16.json + --zero-stage 0 --deepspeed-activation-checkpointing """.split() @@ -103,6 +103,9 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --rampup-batch-size 2 2 {n_samples} --train-samples {n_samples} + --embed-layernorm + --position-embedding-type alibi + --optimizer adam --adam-beta1 0.9 --adam-beta2 0.95 @@ -112,7 +115,7 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --lr-decay-samples 6 --clip-grad 1.0 --weight-decay 1e-1 - --fp16 + --bf16 --log-level debug --log-level-replica info From 804b497d010787648bc9e7e06667e11a34884afc Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 20 May 2022 13:12:50 -0700 Subject: [PATCH 11/33] wip universal chkpt --- tools/convert_checkpoint/ds_to_universal.py | 151 ++++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100755 tools/convert_checkpoint/ds_to_universal.py diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py new file mode 100755 index 000000000..5997d7be3 --- /dev/null +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +import argparse +import os +import torch +from collections import OrderedDict +import sys +from pathlib import Path +from pprint import pprint + +# insert megatron's root dir into sys.path +root_repo_path = str(Path(__file__).resolve().parents[2]) +if root_repo_path not in sys.path: + sys.path.insert(0, root_repo_path) + + +from deepspeed.checkpoint import DeepSpeedCheckpoint + +MODEL_KEY = 'model' +ARGS_KEY = 'args' +LANGUGAGE_MODEL_KEY = 'language_model' +EMBEDDING_KEY = 'embedding' +ENCODER_KEY = 'encoder' +WORD_EMBEDDINGS_FOR_HEAD_KEY = 'word_embeddings_for_head' +WORD_EMBEDDINGS_KEY = 'word_embeddings' +FINAL_LAYER_NORM_KEY = 'final_layernorm' +CHECKPOINT_VERSION_KEY = 'checkpoint_version' +CHECKPOINT_VERSION_VALUE = 3.0 +ITERATION_KEY = 'iteration' + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=1, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=1, + type=int, + help='Target PP degree') + parser.add_argument( + '--for_release', + action='store_true', + help='Convert for release purpose, reset some (progress) counters.') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def _convert_ds_transformer_state(sd_list): + new_sd = OrderedDict() + for i, sd in enumerate(sd_list): + for key, value in sd.items(): + new_key = f'layers.{i}.{key}' + new_sd[new_key] = value + + return new_sd + + +def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): + path_list = [] + iter_folder = f'iter_{iteration:07d}' + for i in range(0, tp_degree): + path_list.append([]) + for j in range(0, pp_degree): + rank_folder = f'mp_rank_{i:02d}' if pp_degree == 1 else f'mp_rank_{i:02d}_{j:03d}' + ckpt_path = os.path.join(rank_folder, 'model_optim_rng.pt') + path_list[i].append( + os.path.join(base_folder, iter_folder, ckpt_path)) + + return path_list + + +def _create_megatron_dict(): + language_model_dict = {EMBEDDING_KEY: {}, ENCODER_KEY: {}} + megatron_dict = { + MODEL_KEY: { + LANGUGAGE_MODEL_KEY: 
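# Sketch of the universal-checkpoint layout the functions that follow build
# (parameter names illustrative; file names as of the later rename patches):
# one directory per flattened parameter, one file per optimizer state:
#
#   output_folder/
#     latest_checkpointed_iteration.txt
#     tied_modules.embed.word_embeddings.weight/
#       fp32.pt
#       exp_avg.pt
#       exp_avg_sq.pt
#     3.input_layernorm.weight/
#       ...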
language_model_dict + }, + CHECKPOINT_VERSION_KEY: CHECKPOINT_VERSION_VALUE + } + return megatron_dict + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + + +def _create_latest_file(base_folder, iteration): + file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt') + os.makedirs(base_folder, exist_ok=True) + with open(file_path, 'w') as f: + f.write(str(iteration)) + + +def save_params_universal(dir, param_shapes): + for name, shape in param_shapes.items(): + param_base_path = os.path.join(dir, name) + os.makedirs(param_base_path, exist_ok=True) + print(f"{name}: {shape} => {param_base_path}") + for state in ("fp32", "momentum", "variance"): + path = os.path.join(param_base_path, state) + param = torch.Tensor(shape) + _save_checkpoint(path, param) + + +def main(): + print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') + + args = parse_arguments() + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}' + ) + + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, + args.target_pp) + iteration = ds_checkpoint.get_iteration() + _create_latest_file(args.output_folder, iteration) + checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, + ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree) + + sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu')) + + param_shapes = sd["param_shapes"] + # fix back to normal dict + param_shapes = dict((k,v) for d in param_shapes for k,v in d.items() ) + pprint(param_shapes) + + save_params_universal(args.output_folder, param_shapes) + + # for i in range(0, ds_checkpoint.tp_degree): + # for j in range(0, ds_checkpoint.pp_degree): + # sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release) + # _save_checkpoint(checkpoint_paths[i][j], sd) + + +if __name__ == "__main__": + main() From c29d336904efacff1e7a828e2f02bc8a85ba1df0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 20 May 2022 13:17:49 -0700 Subject: [PATCH 12/33] rename --- tools/convert_checkpoint/ds_to_universal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 5997d7be3..92793816a 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -110,7 +110,7 @@ def save_params_universal(dir, param_shapes): param_base_path = os.path.join(dir, name) os.makedirs(param_base_path, exist_ok=True) print(f"{name}: {shape} => {param_base_path}") - for state in ("fp32", "momentum", "variance"): + for state in ("fp32", "exp_avg", "exp_avg_sq"): path = os.path.join(param_base_path, state) param = torch.Tensor(shape) _save_checkpoint(path, param) From 9c4479338b82102741a5ab402080ee84081319f7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 20 May 2022 13:19:36 -0700 Subject: [PATCH 13/33] rename --- tools/convert_checkpoint/ds_to_universal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 92793816a..94f6879ee 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -111,7 +111,7 @@ def save_params_universal(dir, param_shapes): os.makedirs(param_base_path, exist_ok=True) print(f"{name}: {shape} => {param_base_path}") 
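# Note: "exp_avg" / "exp_avg_sq" match the key names torch.optim.Adam keeps
# in its per-parameter state, e.g.
#
#   opt.state[p]["exp_avg"]     # running mean of gradients
#   opt.state[p]["exp_avg_sq"]  # running mean of squared gradients
#
# so the renames in these two patches let the universal files map directly
# onto an optimizer state dict.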
for state in ("fp32", "exp_avg", "exp_avg_sq"): - path = os.path.join(param_base_path, state) + path = os.path.join(param_base_path, f"{state}.pt") param = torch.Tensor(shape) _save_checkpoint(path, param) @@ -137,7 +137,7 @@ def main(): param_shapes = sd["param_shapes"] # fix back to normal dict param_shapes = dict((k,v) for d in param_shapes for k,v in d.items() ) - pprint(param_shapes) + #pprint(param_shapes) save_params_universal(args.output_folder, param_shapes) From 7e0a81b98937df18dec6b58130059e7c8d789190 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 24 May 2022 15:12:22 -0700 Subject: [PATCH 14/33] wip on fragments dealing --- tools/convert_checkpoint/ds_to_universal.py | 128 ++++++++++++++++++-- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 94f6879ee..a873dfeed 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -7,6 +7,8 @@ import sys from pathlib import Path from pprint import pprint +from copy import deepcopy +import glob # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) @@ -104,7 +106,7 @@ def _create_latest_file(base_folder, iteration): with open(file_path, 'w') as f: f.write(str(iteration)) - +# XXX: this is a temp hack that creates fake params but with the right shapes def save_params_universal(dir, param_shapes): for name, shape in param_shapes.items(): param_base_path = os.path.join(dir, name) @@ -116,6 +118,105 @@ def save_params_universal(dir, param_shapes): _save_checkpoint(path, param) +def extract_zero_fragments(dir, param_shapes, ds_checkpoint, dp_index, pp_index, tp_index): + sd = ds_checkpoint.get_zero_checkpoint_state( + pp_index=pp_index, + tp_index=tp_index, + dp_index=dp_index) + + pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}") + + optim_sd = sd["optimizer_state_dict"] + param_slice_mappings = optim_sd["param_slice_mappings"] + + # dict + state_groups = optim_sd["base_optimizer_state"]["state"] + # list + fp32_groups = optim_sd["single_partition_of_fp32_groups"] + param_groups_cnt = len(state_groups) + + for param_group_id in range(param_groups_cnt): + + flat_state = dict( + exp_avg=state_groups[param_group_id]["exp_avg"], + exp_avg_sq=state_groups[param_group_id]["exp_avg_sq"], + fp32=fp32_groups[param_group_id], + ) + + for k,v in param_slice_mappings[param_group_id].items(): + print(f"{param_group_id} {k} => {v.start}:{v.numel}") + + for state_key in flat_state.keys(): + dump_param_fragment(dir, state_key, flat_state[state_key], k, v.start, v.numel) + + # XXX: add validation based on param_shapes + + + #pprint(f"{param_group_id} {exp_avg.numel()=} {exp_avg_sq.numel()=} {fp32.numel()=} ") + + + +cnt = 0 +def dump_param_fragment(dir, state_name, state_flat_tensor, param_name, offset, numel): + + global cnt # temp hack + + param_base_path = os.path.join(dir, param_name) + os.makedirs(param_base_path, exist_ok=True) + + cnt += 1 + counter = f"{cnt:0>10d}" + + path = os.path.join(param_base_path, f"{state_name}.{counter}") + + print(f"{param_name}: {offset}: {numel} => {path}") + + t = state_flat_tensor.narrow(0, offset, numel) + _save_checkpoint(path, t) + + # XXX: reshape to shape + + +def merge_zero_fragments(dir, param_shapes): + + for name, shape in param_shapes.items(): + param_base_path = os.path.join(dir, name) + print(f"\n{name}: {shape} => {param_base_path}") + + # XXX: shouldn't be in the states + if 
"position_embeddings" in name: + continue + + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + final_path = os.path.join(param_base_path, f"{state}.pt") + prefix_path = os.path.join(param_base_path, f"{state}") + paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) + orig_paths = deepcopy(paths) + + + # XXX: tmp hack - need to deal with tied vars here + if "word_embeddings" in name and len(paths)>1: + paths = [paths[0]] + + + print(paths) + + fragments = [torch.load(p) for p in paths] + + print(f"Expected shape: {shape}") + print(f"Fragment sizes:", list(frag.shape for frag in fragments)) + + # merge + param = torch.cat(fragments, dim=0) + param = param.reshape(shape) + print(f"Final shape: {param.shape}") + _save_checkpoint(final_path, param) + for p in orig_paths: + os.unlink(p) + + + def main(): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') @@ -124,27 +225,30 @@ def main(): f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}' ) - ds_checkpoint = DeepSpeedCheckpoint(args.input_folder, args.target_tp, - args.target_pp) + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)#, 1, 2) # args.target_tp, args.target_pp) iteration = ds_checkpoint.get_iteration() _create_latest_file(args.output_folder, iteration) checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) - sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu')) + mp_sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu')) - param_shapes = sd["param_shapes"] - # fix back to normal dict + param_shapes = mp_sd["param_shapes"] + # fix back to normal flat dict param_shapes = dict((k,v) for d in param_shapes for k,v in d.items() ) - #pprint(param_shapes) - save_params_universal(args.output_folder, param_shapes) + # make fake params + # save_params_universal(args.output_folder, param_shapes) + + for i in range(ds_checkpoint.dp_degree): + for j in range(ds_checkpoint.pp_degree): + for k in range(ds_checkpoint.tp_degree): + print(f"{i=}, {j=}, {k=}") + extract_zero_fragments(args.output_folder, param_shapes, ds_checkpoint, i, j, k) + + merge_zero_fragments(args.output_folder, param_shapes) - # for i in range(0, ds_checkpoint.tp_degree): - # for j in range(0, ds_checkpoint.pp_degree): - # sd = _create_rank_checkpoint(ds_checkpoint, i, j, args.for_release) - # _save_checkpoint(checkpoint_paths[i][j], sd) if __name__ == "__main__": From d300512083934b11c9e6f8e79bd43badd79709fb Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 24 May 2022 15:26:14 -0700 Subject: [PATCH 15/33] cleanup --- tools/convert_checkpoint/ds_to_universal.py | 23 ++++++++++----------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index a873dfeed..19b6ec773 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -1,14 +1,15 @@ #!/usr/bin/env python -import argparse -import os -import torch from collections import OrderedDict -import sys +from copy import deepcopy from pathlib import Path from pprint import pprint -from copy import deepcopy +import argparse import glob +import logging +import os +import sys +import torch # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) @@ -149,10 +150,6 @@ def extract_zero_fragments(dir, param_shapes, ds_checkpoint, dp_index, pp_index, 
for state_key in flat_state.keys(): dump_param_fragment(dir, state_key, flat_state[state_key], k, v.start, v.numel) - # XXX: add validation based on param_shapes - - - #pprint(f"{param_group_id} {exp_avg.numel()=} {exp_avg_sq.numel()=} {fp32.numel()=} ") @@ -174,7 +171,6 @@ def dump_param_fragment(dir, state_name, state_flat_tensor, param_name, offset, t = state_flat_tensor.narrow(0, offset, numel) _save_checkpoint(path, t) - # XXX: reshape to shape def merge_zero_fragments(dir, param_shapes): @@ -196,8 +192,8 @@ def merge_zero_fragments(dir, param_shapes): # XXX: tmp hack - need to deal with tied vars here - if "word_embeddings" in name and len(paths)>1: - paths = [paths[0]] + if "word_embeddings.weight" in name and len(paths)>1: + paths = paths[:1] print(paths) @@ -215,6 +211,9 @@ def merge_zero_fragments(dir, param_shapes): for p in orig_paths: os.unlink(p) + # XXX: probably not needed since torch.reshape would have failed if the inputs size was wrong + if param.shape != shape: + logging.error(f"✘ {name}: expected {shape} but got {param.shape}") def main(): From ab0a7f8fff3cb6fc208a966008f301bbbcbe38f1 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 27 May 2022 01:57:55 +0500 Subject: [PATCH 16/33] Loading universal checkpoint with reshaping --- megatron/arguments.py | 2 + megatron/checkpointing.py | 3 +- run_bf16.sh | 44 ++++--- tests/test_checkpoints.py | 4 +- tools/convert_checkpoint/ds_to_universal.py | 127 ++++++++++++++------ 5 files changed, 121 insertions(+), 59 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 2be64b77d..bb9373d7d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -635,6 +635,8 @@ def _add_learning_rate_args(parser): '(learning rate, warmup iterations, minimum learning ' 'rate, maximum number of iterations, and decay style ' 'from checkpoint and ignore input arguments.') + group.add_argument('--universal-checkpoint', action='store_true', + help='Loading a universal format checkpoint.') return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index 51229819f..dacbec7dc 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -363,7 +363,8 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True assert args.consumed_valid_samples == 0 if 'args' in state_dict: checkpoint_args = state_dict['args'] - check_checkpoint_args(checkpoint_args) + if not args.universal_checkpoint: + check_checkpoint_args(checkpoint_args) args.consumed_train_samples = getattr(checkpoint_args, 'consumed_train_samples', 0) update_num_microbatches(consumed_samples=args.consumed_train_samples) diff --git a/run_bf16.sh b/run_bf16.sh index fd3a48398..528b7576a 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -12,10 +12,10 @@ DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` #DATASET_3="" #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" -BASE_DATA_PATH=/data/Megatron-LM/data -DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron -VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json -MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt +BASE_DATA_PATH=tests/data/gpt2 +DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document +VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json +MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt script_path=$(realpath $0) @@ -26,28 +26,29 @@ CONFIG_JSON="/tmp/ds_config.json" USE_DEEPSPEED=1 ZERO_STAGE=0 - -# Debug #TP=4 #PP=4 -#LAYERS=8 -#HIDDEN=512 -#SEQ=1024 -#GLOBAL_BATCH=128 -#WORKER_STR="-i worker-0" - -TP=1 -PP=1 +# Debug +DEBUG_MODE=1 +if [[ 
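# (Aside) The --universal-checkpoint flag wired up in this patch does two
# things at load time: it skips check_checkpoint_args(), since a reshaped
# run is expected to disagree with the args saved in the checkpoint, and it
# is what later (in megatron/training.py) injects
#   "checkpoint": {"load_universal": true}
# into the DeepSpeed config.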
$DEBUG_MODE == 1 ]]; then + LAYERS=4 + HIDDEN=512 + SEQ=512 +else + HIDDEN=1024 + LAYERS=24 + SEQ=1024 +fi + +TP=2 +PP=2 DP=2 WORLD_SIZE=$((TP*PP*DP)) -HIDDEN=1024 -LAYERS=24 -SEQ=1024 -GLOBAL_BATCH=1 -WORKER_STR="" +GLOBAL_BATCH=2 MICRO_BATCH=1 +CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LR=6.0e-4 MIN_LR=6.0e-5 @@ -108,7 +109,10 @@ options=" \ --init-method-std 0.006 \ --${DTYPE} \ --checkpoint-activations \ - --exit-interval 10000 \ + --exit-interval 3 \ + --save ${CHECKPOINT_PATH} \ + --position-embedding-type alibi \ + --embed-layernorm \ --tensorboard-dir $LOG_DIR " diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 79e985365..f05c2f248 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -200,8 +200,8 @@ def reshape_checkpoint(self, input_dir, output_dir, target_tp_size, target_pp_si def test_checkpoint_reshaping_tp2_pp1_dp1(self): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) + output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) + output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) # 1. train with TP=2 / PP=1 self.train_checkpoint(output_dir1, tp_size=2, pp_size=1, dp_size=1) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 19b6ec773..eddb22934 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -2,12 +2,14 @@ from collections import OrderedDict from copy import deepcopy +from email.policy import default from pathlib import Path from pprint import pprint import argparse import glob import logging import os +import shutil import sys import torch @@ -108,8 +110,8 @@ def _create_latest_file(base_folder, iteration): f.write(str(iteration)) # XXX: this is a temp hack that creates fake params but with the right shapes -def save_params_universal(dir, param_shapes): - for name, shape in param_shapes.items(): +def save_params_universal(dir, slice_shapes): + for name, shape in slice_shapes.items(): param_base_path = os.path.join(dir, name) os.makedirs(param_base_path, exist_ok=True) print(f"{name}: {shape} => {param_base_path}") @@ -119,7 +121,7 @@ def save_params_universal(dir, param_shapes): _save_checkpoint(path, param) -def extract_zero_fragments(dir, param_shapes, ds_checkpoint, dp_index, pp_index, tp_index): +def extract_zero_shards(dir, slice_shapes, ds_checkpoint, pp_index, tp_index, dp_index): sd = ds_checkpoint.get_zero_checkpoint_state( pp_index=pp_index, tp_index=tp_index, @@ -144,21 +146,24 @@ def extract_zero_fragments(dir, param_shapes, ds_checkpoint, dp_index, pp_index, fp32=fp32_groups[param_group_id], ) - for k,v in param_slice_mappings[param_group_id].items(): - print(f"{param_group_id} {k} => {v.start}:{v.numel}") + for name,fragment_mapping in param_slice_mappings[param_group_id].items(): + if "word_embeddings.weight" in name and pp_index > 0: + # Skip tied weights that are replicated in first and last pp stages + continue + print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") for state_key in flat_state.keys(): - dump_param_fragment(dir, state_key, flat_state[state_key], k, v.start, v.numel) + dump_param_fragment(dir, tp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel) cnt = 0 -def 
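# Sketch of the on-disk shard layout dump_param_fragment() (continued below)
# writes before merging: per-parameter, per-tp-rank subdirectories, with a
# global zero-padded counter so the later sorted glob replays dp shards in
# write (i.e. ascending-offset) order:
#
#   <output>/tmp/<param_name>/<tp_index>/fp32.0000000001
#   <output>/tmp/<param_name>/<tp_index>/exp_avg.0000000002
#   ...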
dump_param_fragment(dir, state_name, state_flat_tensor, param_name, offset, numel): +def dump_param_fragment(dir, tp_index, state_name, state_flat_tensor, param_name, offset, numel): global cnt # temp hack - param_base_path = os.path.join(dir, param_name) + param_base_path = os.path.join(dir, param_name, str(tp_index)) os.makedirs(param_base_path, exist_ok=True) cnt += 1 @@ -172,44 +177,90 @@ def dump_param_fragment(dir, state_name, state_flat_tensor, param_name, offset, _save_checkpoint(path, t) +def _cleanup_zero_shard_files(param_base_path, state, tp_degree): + for tp_index in range(tp_degree): + prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") + for p in sorted(list(glob.glob(f"{prefix_path}.0*"))): + os.unlink(p) -def merge_zero_fragments(dir, param_shapes): - for name, shape in param_shapes.items(): +def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): + slices = [] + for tp_index in range(tp_degree): + prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") + paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) + print(paths) + shards = [torch.load(p) for p in paths] + slices.append(torch.cat(shards, dim=0).reshape(slice_shape)) + + return slices + + +WEIGHTS_TO_AVERAGE_PATTERNS = [ + r"tied_modules.embed.word_embeddings.norm.weight", + r"tied_modules.embed.word_embeddings.norm.bias", + r"\d+.input_layernorm.weight", + r"\d+.input_layernorm.bias", + r"\d+.post_attention_layernorm.weight", + r"\d+.post_attention_layernorm.bias", + r"\d+.self_attention.dense.bias", + r"\d+.mlp.dense_4h_to_h.bias", + r"\d+.weight", + r"\d+.bias", +] + +WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ + "dense_4h_to_h", + "self_attention.dense.weight", +] + +def merge_tp_slices(dir, slice_dir, slice_shapes, tp_degree): + + for name, shape in slice_shapes.items(): + slice_base_path = os.path.join(slice_dir, name) param_base_path = os.path.join(dir, name) - print(f"\n{name}: {shape} => {param_base_path}") + print(f"\n{name}: {shape} => {slice_base_path} -----> {param_base_path}") # XXX: shouldn't be in the states if "position_embeddings" in name: continue - for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) final_path = os.path.join(param_base_path, f"{state}.pt") - prefix_path = os.path.join(param_base_path, f"{state}") - paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) - orig_paths = deepcopy(paths) + + +# for state in ("fp32", "exp_avg", "exp_avg_sq"): +# final_path = os.path.join(param_base_path, f"{state}.pt") +# prefix_path = os.path.join(param_base_path, f"{state}") +# paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) +# orig_paths = deepcopy(paths) # XXX: tmp hack - need to deal with tied vars here - if "word_embeddings.weight" in name and len(paths)>1: - paths = paths[:1] +# if "word_embeddings.weight" in name and len(paths)>1: +# paths = paths[:1] - print(paths) +# print(paths) - fragments = [torch.load(p) for p in paths] +# shards = [torch.load(p) for p in paths] print(f"Expected shape: {shape}") - print(f"Fragment sizes:", list(frag.shape for frag in fragments)) + print(f"Fragment sizes:", list(frag.shape for frag in slices)) # merge - param = torch.cat(fragments, dim=0) - param = param.reshape(shape) + # XXX - Add merging strategy + import re + + if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): + param = sum(slices) / len(slices) + else: + cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + param = 
torch.cat(slices, dim=cat_dim) + print(f"Final shape: {param.shape}") _save_checkpoint(final_path, param) - for p in orig_paths: - os.unlink(p) # XXX: probably not needed since torch.reshape would have failed if the inputs size was wrong if param.shape != shape: @@ -225,29 +276,33 @@ def main(): ) ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)#, 1, 2) # args.target_tp, args.target_pp) + iteration = ds_checkpoint.get_iteration() _create_latest_file(args.output_folder, iteration) checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) - mp_sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu')) + slice_shapes = [] + for mp_rank_file in ds_checkpoint.mp_rank_files: + mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu')) + slice_shapes += mp_sd["param_shapes"] - param_shapes = mp_sd["param_shapes"] - # fix back to normal flat dict - param_shapes = dict((k,v) for d in param_shapes for k,v in d.items() ) + # fix back to normal flat dict, merge duplicates for tp>1 + slice_shapes = dict((k,v) for d in slice_shapes for k,v in d.items() ) # make fake params - # save_params_universal(args.output_folder, param_shapes) - - for i in range(ds_checkpoint.dp_degree): - for j in range(ds_checkpoint.pp_degree): - for k in range(ds_checkpoint.tp_degree): + # save_params_universal(args.output_folder, slice_shapes) + temp_dir = os.path.join(args.output_folder, 'tmp') + for i in range(ds_checkpoint.pp_degree): + for j in range(ds_checkpoint.tp_degree): + for k in range(ds_checkpoint.dp_degree): print(f"{i=}, {j=}, {k=}") - extract_zero_fragments(args.output_folder, param_shapes, ds_checkpoint, i, j, k) - - merge_zero_fragments(args.output_folder, param_shapes) + extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) + merge_tp_slices(args.output_folder, temp_dir, slice_shapes, ds_checkpoint.tp_degree) + + shutil.rmtree(temp_dir, ignore_errors=True) if __name__ == "__main__": From d5e33dec9107759783779008be2ddf920f0683c6 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 27 May 2022 21:20:18 -0700 Subject: [PATCH 17/33] all gpu1<->2 reshapes work --- megatron/training.py | 46 +++++++- tests/test_checkpoints.py | 106 ++++++++++++++---- .../deepspeed_to_deepspeed.py | 28 ++--- tools/convert_checkpoint/ds_to_universal.py | 65 +++++------ 4 files changed, 172 insertions(+), 73 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 84fd4eb9d..21e64638b 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -367,6 +367,32 @@ def get_learning_rate_scheduler(optimizer): return lr_scheduler +def sync_hp_to_lp(optimizer): + + optimizer.update_lp_params() + + # for n,p in model.named_parameters(): + # print(n) + + # if p._hp_mapping is not None: + # #print(f'rank {rank} fixing hp for input_layernorm') + # #p._hp_mapping.update_hp() + + # hp = p._hp_mapping.hp_fragment + + + + # torch.distributed.all_reduce(hp, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + + # # 3. 
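# Note on sync_hp_to_lp() above (intent as this series applies it): after a
# universal load only the fp32 "hp" master state is populated, so the bf16
# "lp" module weights are refreshed from it via
# optimizer.update_lp_params(); the surrounding commented-out block is an
# earlier per-parameter variant kept for reference.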
optim states + # for key in ['exp_avg', 'exp_avg_sq']: + # optim_state_fragment = p._hp_mapping.get_optim_state_fragment(key) + # #print(f'rank {rank} before reduce optim state fragment {key} = {optim_state_fragment}') + # torch.distributed.all_reduce(optim_state_fragment, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) + # #print(f'rank {rank} after reduce optim state fragment {key} = {optim_state_fragment}') + + + def setup_model_and_optimizer(model_provider_func): """Setup model and optimizer.""" args = get_args() @@ -382,12 +408,21 @@ def setup_model_and_optimizer(model_provider_func): if args.deepspeed: print_rank_0("DeepSpeed is enabled.") - pp = mpu.get_pipeline_model_parallel_world_size() + #pp = mpu.get_pipeline_model_parallel_world_size() + + import json + import io + with io.open(args.deepspeed_config, "r", encoding="utf-8") as f: + config = json.load(f) + if args.universal_checkpoint: + config["checkpoint"] = {"load_universal": True} + model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model[0], optimizer=optimizer, + lr_scheduler=lr_scheduler, + config=config, args=args, - lr_scheduler=lr_scheduler ) assert model.fp16_enabled() == args.fp16, "megatron fp16 config does not match deepspeed" @@ -412,6 +447,13 @@ def setup_model_and_optimizer(model_provider_func): torch.distributed.barrier() timers('load-checkpoint').stop() timers.log(['load-checkpoint']) + + + # hp -> lp + if args.deepspeed and args.universal_checkpoint: + sync_hp_to_lp(optimizer) + + else: args.iteration = 0 diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index f05c2f248..9603968e1 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -16,6 +16,7 @@ import os from pathlib import Path +from parameterized import parameterized from megatron.testing_utils import ( CaptureStdout, TestCasePlus, @@ -30,6 +31,37 @@ set_seed(42) +def parameterized_custom_name_func(func, param_num, param): + # customize the test name generator function as we want both params to appear in the sub-test + # name, as by default it shows only the first param + param_based_name = parameterized.to_safe_name("_to_".join(str(x) for x in param.args)) + return f"{func.__name__}_{param_based_name}" + +params = [ + ["1_1_1", "1_1_1"], + ["2_1_1", "1_1_1"], + ["1_2_1", "1_1_1"], + ["1_1_2", "1_1_1"], + + ["2_1_1", "2_1_1"], + ["1_1_1", "2_1_1"], + ["1_1_1", "1_2_1"], + ["1_1_1", "1_1_2"], + + ["1_1_2", "1_1_2"], + ["1_1_2", "2_1_1"], + ["1_1_2", "1_2_1"], + + ["1_2_1", "1_2_1"], + ["1_2_1", "2_1_1"], + ["1_2_1", "1_1_2"], + + ["2_1_1", "2_1_1"], + ["2_1_1", "1_2_1"], + ["2_1_1", "1_1_2"], + +] + def get_launcher(num_gpus): # 1. 
explicitly set --num_nodes=1 just in case these tests end up run on a multi-node setup # - it won't be able to handle that @@ -60,6 +92,11 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): exit_interval = 20 # some samples in the first half and then some more in the 2nd half after resume seq_len = 128 + # XXX: for now while testing shapes make it really short and fast + exit_interval = 1 + seq_len = 8 + + # common/shared configs ds_args = f""" @@ -75,9 +112,9 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --distributed-backend nccl --log-interval 1 - --save-interval 20 + --save-interval 1 --eval-interval 10 - --eval-iters 5 + --eval-iters 1 --checkpoint-activations --partition-activations --exit-interval {exit_interval} @@ -94,10 +131,10 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --log-validation-ppl-to-tensorboard --num-layers 2 - --hidden-size 64 + --hidden-size 8 --num-attention-heads 2 --seq-length {seq_len} - --max-position-embeddings 1024 + --max-position-embeddings 8 --micro-batch-size 1 --global-batch-size 16 --rampup-batch-size 2 2 {n_samples} @@ -139,7 +176,7 @@ def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + #print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die # 1. test training from scratch (no checkpoint) with CaptureStdout() as cs: @@ -157,6 +194,19 @@ def train_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): # test checkpoint saving self.assertIn("successfully saved checkpoint at iteration", cs.out) + def convert_checkpoint_to_universal(self, output_dir, step): + cmd = f""" + python tools/convert_checkpoint/ds_to_universal.py + --input_folder {output_dir}/checkpoints/global_step{step} + --output_folder {output_dir}/checkpoints/global_step{step}_universal + """.split() + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + + with CaptureStdout() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + + self.assertIn("Convert DeepSpeed Checkpoint to Universal Checkpoint", cs.out) def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): src_dir = self.src_dir @@ -180,37 +230,49 @@ def resume_from_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): # test checkpoint saving self.assertIn("successfully saved checkpoint at iteration", cs.out) + def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_size=1): + src_dir = self.src_dir + script = [f"{src_dir}/pretrain_gpt.py"] - def reshape_checkpoint(self, input_dir, output_dir, target_tp_size, target_pp_size): - cmd = f""" - python tools/convert_checkpoint/deepspeed_to_deepspeed.py - --input_folder {input_dir}/checkpoints/global_step20 - --output_folder {output_dir}/checkpoints - --target_tp {target_tp_size} --target_pp {target_pp_size} - """.split() + args, ds_args, num_gpus = self.get_config(output_dir, tp_size, pp_size, dp_size) + launcher = get_launcher(num_gpus) + cmd = launcher + script + args + ds_args + ["--universal-checkpoint"] + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) - self.assertIn("Convert DeepSpeed Checkpoint to DeepSpeed Checkpoint", cs.out) + # test checkpoint loading + self.assertIn(f"successfully loaded checkpoint from 
{output_dir}/checkpoints", cs.out) + + # test reports + self.assertIn("consumed samples", cs.out) + # test checkpoint saving + self.assertIn("successfully saved checkpoint at iteration", cs.out) @require_torch_multi_gpu - def test_checkpoint_reshaping_tp2_pp1_dp1(self): + @parameterized.expand(params, name_func=parameterized_custom_name_func) + def test_checkpoint_reshaping_2gpus(self, src, dst): # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir("./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir("./xxx2", after=False) + src = list(map(int, src.split('_'))) + dst = list(map(int, dst.split('_'))) + #print(src, dst) + #die + + output_dir = self.get_auto_remove_tmp_dir("./xxx1", after=False) # 1. train with TP=2 / PP=1 - self.train_checkpoint(output_dir1, tp_size=2, pp_size=1, dp_size=1) + self.train_checkpoint(output_dir, tp_size=src[0], pp_size=src[1], dp_size=src[2]) # 2. convert checkpoint to TP=1 / PP=1 - self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 - self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) + self.resume_from_universal_checkpoint(output_dir, tp_size=dst[0], pp_size=dst[1], dp_size=dst[2]) @require_torch_multi_gpu @@ -224,7 +286,7 @@ def test_checkpoint_reshaping_tp2_pp2_dp1(self): self.train_checkpoint(output_dir1, tp_size=2, pp_size=2, dp_size=1) # 2. convert checkpoint to TP=1 / PP=1 - self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.convert_checkpoint_to_universal(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) @@ -241,7 +303,7 @@ def test_checkpoint_reshaping_tp1_pp2_dp1(self): self.train_checkpoint(output_dir1, tp_size=1, pp_size=2, dp_size=1) # 2. convert checkpoint to TP=1 / PP=1 - self.reshape_checkpoint(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.convert_checkpoint_to_universal(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) # 3. 
check we can resume training from a reshaped checkpoint with TP=1 / PP=1 self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) @@ -254,4 +316,4 @@ def test_checkpoint_reshaping_empty_dir(self): output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) with self.assertRaises(RuntimeError) as context: - self.reshape_checkpoint(input_dir=output_dir1+"/xyz", output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.convert_checkpoint_to_universal(input_dir=output_dir1+"/xyz", output_dir=output_dir2, target_tp_size=1, target_pp_size=1) diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed.py b/tools/convert_checkpoint/deepspeed_to_deepspeed.py index f7605f56c..8d484e88d 100644 --- a/tools/convert_checkpoint/deepspeed_to_deepspeed.py +++ b/tools/convert_checkpoint/deepspeed_to_deepspeed.py @@ -5,7 +5,7 @@ import torch from pathlib import Path - + # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) if root_repo_path not in sys.path: @@ -13,14 +13,14 @@ from megatron.tokenizer.tokenizer import _vocab_size_with_padding from deepspeed.checkpoint.deepspeed_checkpoint import ( - ARGS_KEY, + ARGS_KEY, CHECKPOINT_INFO_KEY, ) from deepspeed.checkpoint import ( - DeepSpeedCheckpoint, - get_model_ckpt_name_for_rank, - get_zero_ckpt_name_for_rank, + DeepSpeedCheckpoint, + get_model_ckpt_name_for_rank, + get_zero_ckpt_name_for_rank, get_layer_ckpt_name_for_rank ) @@ -70,8 +70,8 @@ def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, p assert len(sd_list) == len(layer_id_list) for sd, layer_id in zip(sd_list, layer_id_list): ckpt_path = get_layer_ckpt_name_for_rank( - base_folder=base_folder, - layer_id=layer_id, + base_folder=base_folder, + layer_id=layer_id, tp_rank=tp_index) _save_checkpoint(ckpt_path, sd) @@ -93,8 +93,8 @@ def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index): sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY]) layer_id = ds_checkpoint.get_embedding_layer_id() ckpt_path = get_layer_ckpt_name_for_rank( - base_folder=base_folder, - tp_rank=tp_index, + base_folder=base_folder, + tp_rank=tp_index, layer_id=layer_id) _save_checkpoint(ckpt_path, sd) @@ -103,8 +103,8 @@ def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index): sd = ds_checkpoint.get_final_norm_state(tp_index) layer_id = ds_checkpoint.get_final_norm_layer_id() ckpt_path = get_layer_ckpt_name_for_rank( - base_folder=base_folder, - tp_rank=tp_index, + base_folder=base_folder, + tp_rank=tp_index, layer_id=layer_id) _save_checkpoint(ckpt_path, sd) @@ -132,7 +132,7 @@ def _create_zero_checkpoint(ds_checkpoint, base_folder, dp_index, pp_index, tp_i tp_index=tp_index, dp_index=dp_index) - ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder, + ckpt_path = get_zero_ckpt_name_for_rank(base_folder=base_folder, dp_rank=dp_index, mp_rank=_2d_rank) _save_checkpoint(ckpt_path, sd) @@ -154,7 +154,7 @@ def main(): ) ds_checkpoint = DeepSpeedCheckpoint( - args.input_folder, + args.input_folder, args.target_tp, args.target_pp, args.target_dp) @@ -172,7 +172,7 @@ def main(): for j in range(ds_checkpoint.pp_degree): _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j) _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j) - + for i in range(ds_checkpoint.dp_degree): for j in range(ds_checkpoint.pp_degree): for k in 
range(ds_checkpoint.tp_degree): diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index eddb22934..d568706b6 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -12,6 +12,7 @@ import shutil import sys import torch +import re # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) @@ -148,8 +149,8 @@ def extract_zero_shards(dir, slice_shapes, ds_checkpoint, pp_index, tp_index, dp for name,fragment_mapping in param_slice_mappings[param_group_id].items(): if "word_embeddings.weight" in name and pp_index > 0: - # Skip tied weights that are replicated in first and last pp stages - continue + # Skip tied weights that are replicated in first and last pp stages + continue print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") for state_key in flat_state.keys(): @@ -191,9 +192,16 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) print(paths) shards = [torch.load(p) for p in paths] - slices.append(torch.cat(shards, dim=0).reshape(slice_shape)) + slice = torch.cat(shards, dim=0).reshape(slice_shape) + slices.append(slice) - return slices + return slices + + +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): + checkpoint_info = ds_checkpoint.get_checkpoint_info() + return padded_vocab_tensor.narrow(0, 0, checkpoint_info[ORIGINAL_VOCAB_SIZE]) WEIGHTS_TO_AVERAGE_PATTERNS = [ @@ -210,11 +218,16 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): ] WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ - "dense_4h_to_h", + "dense_4h_to_h.weight", "self_attention.dense.weight", ] -def merge_tp_slices(dir, slice_dir, slice_shapes, tp_degree): + + + +def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): + + for name, shape in slice_shapes.items(): slice_base_path = os.path.join(slice_dir, name) @@ -228,44 +241,26 @@ def merge_tp_slices(dir, slice_dir, slice_shapes, tp_degree): for state in ("fp32", "exp_avg", "exp_avg_sq"): slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) final_path = os.path.join(param_base_path, f"{state}.pt") - - -# for state in ("fp32", "exp_avg", "exp_avg_sq"): -# final_path = os.path.join(param_base_path, f"{state}.pt") -# prefix_path = os.path.join(param_base_path, f"{state}") -# paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) -# orig_paths = deepcopy(paths) - - - # XXX: tmp hack - need to deal with tied vars here -# if "word_embeddings.weight" in name and len(paths)>1: -# paths = paths[:1] - - -# print(paths) - -# shards = [torch.load(p) for p in paths] print(f"Expected shape: {shape}") print(f"Fragment sizes:", list(frag.shape for frag in slices)) - # merge - # XXX - Add merging strategy - import re - if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): param = sum(slices) / len(slices) else: - cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - param = torch.cat(slices, dim=cat_dim) + cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + print(f"CAT DIM: {cat_dim}") + param = torch.cat(slices, dim=cat_dim) + + if "word_embeddings.weight" in name: + print(f"Before {param.shape=}") + # strip padding + param = _strip_vocab_padding(ds_checkpoint, param) + print(f"After {param.shape=}") print(f"Final shape: 
{param.shape}") _save_checkpoint(final_path, param) - # XXX: probably not needed since torch.reshape would have failed if the inputs size was wrong - if param.shape != shape: - logging.error(f"✘ {name}: expected {shape} but got {param.shape}") - def main(): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') @@ -300,8 +295,8 @@ def main(): print(f"{i=}, {j=}, {k=}") extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) - merge_tp_slices(args.output_folder, temp_dir, slice_shapes, ds_checkpoint.tp_degree) - + merge_tp_slices(ds_checkpoint, args.output_folder, temp_dir, slice_shapes, ds_checkpoint.tp_degree) + shutil.rmtree(temp_dir, ignore_errors=True) From 85ff56ca9f5d3c07ab32463f6c8b3d773cc98ffa Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 1 Jun 2022 04:04:54 +0500 Subject: [PATCH 18/33] param attrs --- tests/test_checkpoints.py | 4 ++++ tools/convert_checkpoint/ds_to_universal.py | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 9603968e1..39c31afd7 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -60,6 +60,10 @@ def parameterized_custom_name_func(func, param_num, param): ["2_1_1", "1_2_1"], ["2_1_1", "1_1_2"], + ["2_2_2", "1_1_1"], + ["2_2_2", "2_2_2"], + ["1_1_1", "2_2_2"], + ] def get_launcher(num_gpus): diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index d568706b6..55c8db7ea 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -226,9 +226,6 @@ def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): - - - for name, shape in slice_shapes.items(): slice_base_path = os.path.join(slice_dir, name) param_base_path = os.path.join(dir, name) @@ -244,22 +241,25 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): print(f"Expected shape: {shape}") print(f"Fragment sizes:", list(frag.shape for frag in slices)) - + ckpt_dict = {} if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): param = sum(slices) / len(slices) else: cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 print(f"CAT DIM: {cat_dim}") param = torch.cat(slices, dim=cat_dim) + ckpt_dict['cat_dim'] = cat_dim if "word_embeddings.weight" in name: print(f"Before {param.shape=}") # strip padding param = _strip_vocab_padding(ds_checkpoint, param) + ckpt_dict['tensor_to_pad'] = True print(f"After {param.shape=}") print(f"Final shape: {param.shape}") - _save_checkpoint(final_path, param) + ckpt_dict['param'] = param + _save_checkpoint(final_path, ckpt_dict) def main(): From f01fa4a5239e88933b82ea18604dbb6e498ac623 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 31 May 2022 17:16:01 -0700 Subject: [PATCH 19/33] make the tests adaptable to the number of available gpus --- tests/test_checkpoints.py | 70 ++++++++++++--------------------------- 1 file changed, 21 insertions(+), 49 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 39c31afd7..51fa39438 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -14,6 +14,7 @@ import io import os +import pytest from pathlib import Path from parameterized import parameterized @@ -259,65 +260,36 @@ def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_ @require_torch_multi_gpu 
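    # The params matrix from patch 17 drives this test: each entry is a pair of
    # "TP_PP_DP" topology strings, e.g. ["2_1_1", "1_2_1"] trains with
    # TP=2/PP=1/DP=1, converts to a universal checkpoint, and resumes with
    # TP=1/PP=2/DP=1. parameterized_custom_name_func folds both topologies into
    # the sub-test name, yielding e.g. test_checkpoint_reshaping_main_2_1_1_to_1_2_1.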
@parameterized.expand(params, name_func=parameterized_custom_name_func) - def test_checkpoint_reshaping_2gpus(self, src, dst): - # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus + def test_checkpoint_reshaping_main(self, src, tgt): + # this test needs at least 2 gpus - if there are more gpus it will do more extensive testing - src = list(map(int, src.split('_'))) - dst = list(map(int, dst.split('_'))) - #print(src, dst) - #die + tp_size_src, pp_size_src, dp_size_src = list(map(int, src.split('_'))) + tp_size_tgt, pp_size_tgt, dp_size_tgt = list(map(int, tgt.split('_'))) - output_dir = self.get_auto_remove_tmp_dir("./xxx1", after=False) + n_gpus = get_gpu_count() + n_gpus_src = tp_size_src * pp_size_src * dp_size_src + n_gpus_tgt = tp_size_tgt * pp_size_tgt * dp_size_tgt - # 1. train with TP=2 / PP=1 - self.train_checkpoint(output_dir, tp_size=src[0], pp_size=src[1], dp_size=src[2]) + if n_gpus_src > n_gpus: + pytest.skip(f"the test requires {n_gpus_src} gpus for source topology but have only {n_gpus}") + if n_gpus_tgt > n_gpus: + pytest.skip(f"the test requires {n_gpus_tgt} gpus for target topology but have only {n_gpus}") - # 2. convert checkpoint to TP=1 / PP=1 - self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) - - # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 - self.resume_from_universal_checkpoint(output_dir, tp_size=dst[0], pp_size=dst[1], dp_size=dst[2]) - - - @require_torch_multi_gpu - def test_checkpoint_reshaping_tp2_pp2_dp1(self): - # this test requires at least 4 gpus - will use only 2 gpus for now - XXX: extend to more gpus - - output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) - - # 1. train with TP=2 / PP=2 - self.train_checkpoint(output_dir1, tp_size=2, pp_size=2, dp_size=1) + output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False) - # 2. convert checkpoint to TP=1 / PP=1 - self.convert_checkpoint_to_universal(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + # 1. train with initial topology defined in the first arg of params + self.train_checkpoint(output_dir, tp_size=tp_size_src , pp_size=pp_size_src , dp_size=dp_size_src ) - # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 - self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) - - - @require_torch_multi_gpu - def test_checkpoint_reshaping_tp1_pp2_dp1(self): - # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - - output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) - - # 1. train with TP=1 / PP=2 - self.train_checkpoint(output_dir1, tp_size=1, pp_size=2, dp_size=1) - - # 2. convert checkpoint to TP=1 / PP=1 - self.convert_checkpoint_to_universal(input_dir=output_dir1, output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + # 2. convert checkpoint to universal checkpoint (topology ) + self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) - # 3. check we can resume training from a reshaped checkpoint with TP=1 / PP=1 - self.resume_from_checkpoint(output_dir2, tp_size=1, pp_size=1, dp_size=1) + # 3. 
check we can resume training from a reshaped checkpoint to the target topology - the last arg of params + self.resume_from_universal_checkpoint(output_dir, tp_size=tp_size_tgt, pp_size=pp_size_tgt, dp_size=dp_size_tgt) @require_torch_multi_gpu def test_checkpoint_reshaping_empty_dir(self): - # this test requires at least 2 gpus - will use only 2 gpus for now - XXX: extend to more gpus - output_dir1 = self.get_auto_remove_tmp_dir() # "./xxx1", after=False) - output_dir2 = self.get_auto_remove_tmp_dir() # "./xxx2", after=False) + output_dir = self.get_auto_remove_tmp_dir() # "./xxx", after=False) with self.assertRaises(RuntimeError) as context: - self.convert_checkpoint_to_universal(input_dir=output_dir1+"/xyz", output_dir=output_dir2, target_tp_size=1, target_pp_size=1) + self.convert_checkpoint_to_universal(output_dir=output_dir, step=1) From f29bacc1a6a7199682cf6dddb5f1cf8f2ea9cd04 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 2 Jun 2022 03:40:38 +0500 Subject: [PATCH 20/33] WIP --- tests/test_checkpoints.py | 2 +- tools/convert_checkpoint/ds_to_universal.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 51fa39438..9766f7572 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -243,7 +243,7 @@ def resume_from_universal_checkpoint(self, output_dir, tp_size=1, pp_size=1, dp_ launcher = get_launcher(num_gpus) cmd = launcher + script + args + ds_args + ["--universal-checkpoint"] # keep for quick debug - # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die + #print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die with CaptureStdout() as cs: execute_subprocess_async(cmd, env=self.get_env()) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 55c8db7ea..260f9c760 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -295,9 +295,18 @@ def main(): print(f"{i=}, {j=}, {k=}") extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) - merge_tp_slices(ds_checkpoint, args.output_folder, temp_dir, slice_shapes, ds_checkpoint.tp_degree) - + merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy mp* files into output folder + for f in glob.glob(os.path.join(args.input_folder, 'mp*')): + shutil.copy2(f, args.output_folder) + + # Update latest to output folder + checkpoint_root_folder, step_folder = os.path.split(args.output_folder) + latest_file = os.path.join(checkpoint_root_folder, 'latest_universal') + with open(latest_file, "w") as f: + f.write(step_folder) if __name__ == "__main__": From dd0aeb67f6b0710dcfde9a6c127fa70460473bf6 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Thu, 2 Jun 2022 04:01:10 +0500 Subject: [PATCH 21/33] WIP --- tests/test_checkpoints.py | 5 ++++- tools/convert_checkpoint/ds_to_universal.py | 13 +++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/test_checkpoints.py b/tests/test_checkpoints.py index 9766f7572..fdc41e014 100644 --- a/tests/test_checkpoints.py +++ b/tests/test_checkpoints.py @@ -39,6 +39,7 @@ def parameterized_custom_name_func(func, param_num, param): return f"{func.__name__}_{param_based_name}" params = [ + # TP_PP_DP ["1_1_1", "1_1_1"], ["2_1_1", "1_1_1"], ["1_2_1", "1_1_1"], @@ -65,6 +66,8 @@ def parameterized_custom_name_func(func, 
param_num, param): ["2_2_2", "2_2_2"], ["1_1_1", "2_2_2"], + ["1_1_8", "2_2_2"], + ] def get_launcher(num_gpus): @@ -142,7 +145,6 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --max-position-embeddings 8 --micro-batch-size 1 --global-batch-size 16 - --rampup-batch-size 2 2 {n_samples} --train-samples {n_samples} --embed-layernorm @@ -163,6 +165,7 @@ def get_config(self, output_dir, tp_size, pp_size, dp_size): --log-level-replica info """.split() + # XXX: fails to handle: #--embed-layernorm # diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 260f9c760..e50ff5903 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -154,13 +154,13 @@ def extract_zero_shards(dir, slice_shapes, ds_checkpoint, pp_index, tp_index, dp print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") for state_key in flat_state.keys(): - dump_param_fragment(dir, tp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel) + dump_param_fragment(dir, tp_index, dp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel) cnt = 0 -def dump_param_fragment(dir, tp_index, state_name, state_flat_tensor, param_name, offset, numel): +def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel): global cnt # temp hack @@ -168,7 +168,7 @@ def dump_param_fragment(dir, tp_index, state_name, state_flat_tensor, param_name os.makedirs(param_base_path, exist_ok=True) cnt += 1 - counter = f"{cnt:0>10d}" + counter = f"{dp_index:0>2d}" path = os.path.join(param_base_path, f"{state_name}.{counter}") @@ -178,13 +178,6 @@ def dump_param_fragment(dir, tp_index, state_name, state_flat_tensor, param_name _save_checkpoint(path, t) -def _cleanup_zero_shard_files(param_base_path, state, tp_degree): - for tp_index in range(tp_degree): - prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") - for p in sorted(list(glob.glob(f"{prefix_path}.0*"))): - os.unlink(p) - - def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): slices = [] for tp_index in range(tp_degree): From 3bf14fdf0dad8caa4cf70a172d4369acd75e5421 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 3 Jun 2022 02:48:32 +0500 Subject: [PATCH 22/33] WIP --- megatron/training.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index 21e64638b..948e5c4a4 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -456,6 +456,25 @@ def setup_model_and_optimizer(model_provider_func): else: args.iteration = 0 + + # tp_rank = mpu.get_tensor_model_parallel_rank() + # pp_rank = mpu.get_pipeline_model_parallel_rank() + # dp_rank = mpu.get_data_parallel_rank() + # for n,p in model[0].named_parameters(): + # if 'word_embeddings.weight' not in n: + # continue + # if tp_rank == 0 and pp_rank == 0: + # print(f"{tp_rank=}{pp_rank=}{dp_rank=} bf16 {n=} {p[:10]=}") + # if p._hp_mapping is not None: + # hp = p._hp_mapping.hp_fragment + # print(f'{tp_rank=}{pp_rank=}{dp_rank=} fp32 {n=} {hp[:10]=}') + + # if tp_rank == 0 and pp_rank == mpu.get_pipeline_model_parallel_world_size() - 1: + # print(f"{tp_rank=}{pp_rank=}{dp_rank=} bf16 {n=} {p[:10]=}") + # if p._hp_mapping is not None: + # hp = p._hp_mapping.hp_fragment + # print(f'{tp_rank=}{pp_rank=}{dp_rank=} fp32 {n=} {hp[:10]=}') + # We only support local DDP with multiple 
micro-batches. if len(model) > 1 or mpu.get_pipeline_model_parallel_world_size() > 1: From 7ae002d3dddf9578ff60519d18cb0b16f0ab7ebe Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Fri, 3 Jun 2022 02:48:55 +0500 Subject: [PATCH 23/33] WIP --- run_bf16.sh | 40 ++++++---- run_universal_bf16.sh | 179 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 15 deletions(-) create mode 100755 run_universal_bf16.sh diff --git a/run_bf16.sh b/run_bf16.sh index 528b7576a..d0cd550d5 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -12,16 +12,21 @@ DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` #DATASET_3="" #DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" -BASE_DATA_PATH=tests/data/gpt2 -DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document -VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json -MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt +#BASE_DATA_PATH=tests/data/gpt2 +#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document +#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json +#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt + +BASE_DATA_PATH=/vc_data/Megatron-LM/data +DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron +VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json +MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt script_path=$(realpath $0) script_dir=$(dirname $script_path) -#CONFIG_JSON="$script_dir/ds_config.json" -CONFIG_JSON="/tmp/ds_config.json" +CONFIG_JSON="$script_dir/ds_config.json" +#CONFIG_JSON="/tmp/ds_config.json" USE_DEEPSPEED=1 ZERO_STAGE=0 @@ -30,31 +35,35 @@ ZERO_STAGE=0 #PP=4 # Debug -DEBUG_MODE=1 +DEBUG_MODE=0 if [[ $DEBUG_MODE == 1 ]]; then LAYERS=4 HIDDEN=512 SEQ=512 + EXIT_INTERVAL=3 else HIDDEN=1024 LAYERS=24 SEQ=1024 + EXIT_INTERVAL=10 fi TP=2 PP=2 -DP=2 +DP=4 WORLD_SIZE=$((TP*PP*DP)) -GLOBAL_BATCH=2 +GLOBAL_BATCH=4 MICRO_BATCH=1 +TRAIN_ITERS=10000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} +LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LR=6.0e-4 MIN_LR=6.0e-5 DTYPE="bf16" -EXP_DIR=${HOME}/experiments/results/bf16 -LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_fix3" +EXP_DIR=${HOME}/experiments/results/ckpt_reshape +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont_2" mkdir -p $LOG_DIR while [[ $# -gt 0 ]] @@ -90,7 +99,7 @@ options=" \ --max-position-embeddings $SEQ \ --micro-batch-size $MICRO_BATCH \ --global-batch-size $GLOBAL_BATCH \ - --train-iters 1000 \ + --train-iters $TRAIN_ITERS \ --lr $LR \ --min-lr $MIN_LR \ --lr-decay-style cosine \ @@ -100,7 +109,7 @@ options=" \ --data-path ${DATASET} \ --vocab-file ${VOCAB_PATH} \ --merge-file ${MERGE_PATH} \ - --save-interval 10000 \ + --save-interval 1000 \ --split 98,2,0 \ --clip-grad 1.0 \ --weight-decay 0.1 \ @@ -109,8 +118,9 @@ options=" \ --init-method-std 0.006 \ --${DTYPE} \ --checkpoint-activations \ - --exit-interval 3 \ + --exit-interval ${EXIT_INTERVAL} \ --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ --position-embedding-type alibi \ --embed-layernorm \ --tensorboard-dir $LOG_DIR @@ -155,7 +165,7 @@ cat < $CONFIG_JSON } EOT -WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" #WORKER_STR="-i worker-0:0,1,2,3" #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py 
$@ ${options}" diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh new file mode 100755 index 000000000..b364ae161 --- /dev/null +++ b/run_universal_bf16.sh @@ -0,0 +1,179 @@ +#!/bin/bash + + +DIR=`pwd` +DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'` +#mkdir -p $DIR/logs +#mkdir -p /tmp/logs + + +#DATASET_1="" +#DATASET_2="" +#DATASET_3="" +#DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" + +#BASE_DATA_PATH=tests/data/gpt2 +#DATASET=${BASE_DATA_PATH}/meg-gpt2-openwebtext_text_document +#VOCAB_PATH=${BASE_DATA_PATH}/gpt2-tiny-vocab.json +#MERGE_PATH=${BASE_DATA_PATH}/gpt2-tiny-merges.txt + +BASE_DATA_PATH=/vc_data/Megatron-LM/data +DATASET=${BASE_DATA_PATH}/indexed_datasets/megatron +VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json +MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt + + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +CONFIG_JSON="$script_dir/ds_config.json" +#CONFIG_JSON="/tmp/ds_config.json" + +USE_DEEPSPEED=1 +ZERO_STAGE=0 + +#TP=4 +#PP=4 + +# Debug +DEBUG_MODE=0 +if [[ $DEBUG_MODE == 1 ]]; then + LAYERS=4 + HIDDEN=512 + SEQ=512 + EXIT_INTERVAL=3 +else + HIDDEN=1024 + LAYERS=24 + SEQ=1024 + EXIT_INTERVAL=10 +fi + +TP=2 +PP=2 +DP=4 +WORLD_SIZE=$((TP*PP*DP)) +GLOBAL_BATCH=4 + +MICRO_BATCH=1 +TRAIN_ITERS=10000 +CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} +LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp4 + +LR=6.0e-4 +MIN_LR=6.0e-5 +DTYPE="bf16" +EXP_DIR=${HOME}/experiments/results/ckpt_reshape +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni_2" +mkdir -p $LOG_DIR + +while [[ $# -gt 0 ]] +do +key="$1" +case $key in + --no-deepspeed) + USE_DEEPSPEED=0; + shift + ;; + -z|--zero-stage) + ZERO_STAGE=$2; + shift + ;; + *) + echo "Unknown argument(s)" + usage + exit 1 + shift + ;; +esac +done + + +options=" \ + --tensor-model-parallel-size $TP \ + --pipeline-model-parallel-size $PP \ + --num-layers $LAYERS \ + --hidden-size $HIDDEN \ + --num-attention-heads 32 \ + --seq-length $SEQ \ + --loss-scale 12 \ + --max-position-embeddings $SEQ \ + --micro-batch-size $MICRO_BATCH \ + --global-batch-size $GLOBAL_BATCH \ + --train-iters $TRAIN_ITERS \ + --lr $LR \ + --min-lr $MIN_LR \ + --lr-decay-style cosine \ + --log-interval 1 \ + --eval-iters 40 \ + --eval-interval 10 \ + --data-path ${DATASET} \ + --vocab-file ${VOCAB_PATH} \ + --merge-file ${MERGE_PATH} \ + --save-interval 1000 \ + --split 98,2,0 \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --init-method-std 0.006 \ + --${DTYPE} \ + --checkpoint-activations \ + --exit-interval ${EXIT_INTERVAL} \ + --save ${CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --universal-checkpoint \ + --position-embedding-type alibi \ + --embed-layernorm \ + --tensorboard-dir $LOG_DIR + " + + +if [[ ${USE_DEEPSPEED} -eq 1 ]]; then + echo "Using DeepSpeed" + options="${options} \ + --deepspeed \ + --deepspeed_config=${CONFIG_JSON} \ + --zero-stage=${ZERO_STAGE} \ + --deepspeed-activation-checkpointing \ + " +fi + + +cat < $CONFIG_JSON +{ + "train_batch_size" : $GLOBAL_BATCH, + "train_micro_batch_size_per_gpu": $MICRO_BATCH, + "steps_per_print": 1, + + "zero_optimization": { + "stage": $ZERO_STAGE + }, + + "bf16": { + "enabled": true + }, + + "fp16": { + "enabled": false, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + + "wall_clock_breakdown" : true +} +EOT + 
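The --universal-checkpoint flag used by this script takes effect through the patch-17 hook in megatron/training.py rather than through the JSON itself: the hook reloads the generated config and injects the load_universal key before calling deepspeed.initialize. A minimal sketch of that rewrite, assuming the ds_config.json written by the heredoc above:

    import io
    import json

    # read back the config the heredoc just generated
    with io.open("ds_config.json", "r", encoding="utf-8") as f:
        config = json.load(f)

    # what --universal-checkpoint adds before deepspeed.initialize(config=config, ...)
    config["checkpoint"] = {"load_universal": True}
    print(json.dumps(config, indent=2))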
+#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +#WORKER_STR="-i worker-0:0,1,2,3" +#run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" +#run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" +run_cmd="deepspeed --master_port 29700 $WORKER_STR ${DIR}/pretrain_gpt.py $@ ${options}" + + +echo ${run_cmd} +eval ${run_cmd} + +set +x From 55bb5148039a052bfb6591ff5c892409fcf1df2e Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Sat, 4 Jun 2022 05:52:27 +0500 Subject: [PATCH 24/33] Debug functions --- megatron/training.py | 3 + megatron/utils.py | 76 +++++++++++++++++++++ run_bf16.sh | 9 +-- run_universal_bf16.sh | 9 +-- tools/convert_checkpoint/ds_to_universal.py | 4 +- 5 files changed, 92 insertions(+), 9 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 948e5c4a4..763f3132a 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -457,6 +457,9 @@ def setup_model_and_optimizer(model_provider_func): else: args.iteration = 0 + from .utils import dump_weights + dump_weights(f'{args.universal_checkpoint=}', args.iteration, model, optimizer) + # tp_rank = mpu.get_tensor_model_parallel_rank() # pp_rank = mpu.get_pipeline_model_parallel_rank() # dp_rank = mpu.get_data_parallel_rank() diff --git a/megatron/utils.py b/megatron/utils.py index 98d2f611c..fe0b09de5 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -392,3 +392,79 @@ def found_kill_switch(): return True else: return False + +def get_fingerprint_header(): + return f"{'min':^13} {'max':^13} {'mean':^13} {'l2 norm':^12} metadata" + +def get_fingerprint(p): + return f"{p.min():13.6e} {p.max():13.6e} {p.mean():13.6e} {p.norm():12.6e}" + + +def dump_weights(preamble, iteration, model, optimizer, tensor=None): + tp_rank = mpu.get_tensor_model_parallel_rank() + pp_rank = mpu.get_pipeline_model_parallel_rank() + dp_rank = mpu.get_data_parallel_rank() + dp_size = mpu.get_data_parallel_world_size() + fn = f"debug-bf16-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + + # only care for first and last pp stages and dp0 tp0 + #if not (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()): + # return + + #if not (tp_rank == 0 and dp_rank == 0): + # return + + if tensor is not None: + orig_tensor = tensor + if hasattr(tensor, "_hp_param"): + numel = tensor._hp_param.numel() # // dp_size + tensor = tensor.flatten().narrow(0, 0, numel) + + #print(fn) + with open(fn, "w") as fh: + fh.write(f"{get_fingerprint_header()}\n") + + if tensor is not None: + fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + else: + for n, p in model[0].named_parameters(): + fh.write(f"{get_fingerprint(p)} {n} {p.shape}\n") + + + return + + + # until we figure out how to dump the actual fp32 values don't do this + fn = f"debug-fp32-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-{preamble}.txt" + with open(fn, "w") as fh: + fh.write(f"{get_fingerprint_header()}\n") + if tensor is not None: + tensor = orig_tensor + if hasattr(tensor, "_hp_param"): + fh.write(f"{get_fingerprint(tensor._hp_param)} tensor {tensor._hp_param.shape}\n") + #fh.write(f"{get_fingerprint(tensor._hp_grad)} tensor grad\n") + else: + fh.write(f"{get_fingerprint(tensor)} tensor {tensor.shape}\n") + #fh.write(f"{get_fingerprint(tensor.grad)} tensor grad\n") + + else: + if hasattr(model[0].module.tied_modules, "embed"): + p = model[0].module.tied_modules.embed.word_embeddings.weight._hp_param + fh.write(f"{get_fingerprint(p)} module.tied_modules.embed.word_embeddings.weight._hp_param 
{p.shape}\n") + + # for i, param_group in enumerate(optimizer.param_groups): + # fh.write(f"{get_fingerprint(optimizer.fp32_groups_flat_partition[i])} group={i}\n") + #fh.write(f"{i}={optimizer.fp32_groups_flat_partition[i]}\n") + # if mpu.is_pipeline_first_stage(): + # x = optimizer.fp32_groups_flat_partition[0] + # fh.write(f"fp32={x[:402432]}\n") + # if mpu.is_pipeline_last_stage()): + # x = optimizer.fp32_groups_flat_partition[1] + # fh.write(f"fp32={x[-402432:]}\n") + + # import os + # import socket + # hostname = socket.gethostname() + # pid = os.getpid() + # global_rank = torch.distributed.get_rank() + #fn = f"debug-{iteration}-pp{pp_rank}-tp{tp_rank}-dp{dp_rank}-global{global_rank}-{preamble}-{pid}.txt" \ No newline at end of file diff --git a/run_bf16.sh b/run_bf16.sh index d0cd550d5..d1bdb2996 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -55,15 +55,15 @@ WORLD_SIZE=$((TP*PP*DP)) GLOBAL_BATCH=4 MICRO_BATCH=1 -TRAIN_ITERS=10000 +TRAIN_ITERS=100000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} -LR=6.0e-4 -MIN_LR=6.0e-5 +LR=0 # 6.0e-4 +MIN_LR=0 # 6.0e-5 DTYPE="bf16" EXP_DIR=${HOME}/experiments/results/ckpt_reshape -LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont_2" +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont" mkdir -p $LOG_DIR while [[ $# -gt 0 ]] @@ -122,6 +122,7 @@ options=" \ --save ${CHECKPOINT_PATH} \ --load ${LOAD_CHECKPOINT_PATH} \ --position-embedding-type alibi \ + --override-lr-scheduler \ --embed-layernorm \ --tensorboard-dir $LOG_DIR " diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh index b364ae161..c2d0346a6 100755 --- a/run_universal_bf16.sh +++ b/run_universal_bf16.sh @@ -55,15 +55,15 @@ WORLD_SIZE=$((TP*PP*DP)) GLOBAL_BATCH=4 MICRO_BATCH=1 -TRAIN_ITERS=10000 +TRAIN_ITERS=100000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp4 -LR=6.0e-4 -MIN_LR=6.0e-5 +LR=0 # 6.0e-4 +MIN_LR=0 # 6.0e-5 DTYPE="bf16" EXP_DIR=${HOME}/experiments/results/ckpt_reshape -LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni_2" +LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni" mkdir -p $LOG_DIR while [[ $# -gt 0 ]] @@ -123,6 +123,7 @@ options=" \ --load ${LOAD_CHECKPOINT_PATH} \ --universal-checkpoint \ --position-embedding-type alibi \ + --override-lr-scheduler \ --embed-layernorm \ --tensorboard-dir $LOG_DIR " diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index e50ff5903..6ace88f8d 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -194,6 +194,8 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): ORIGINAL_VOCAB_SIZE = 'original_vocab_size' def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): checkpoint_info = ds_checkpoint.get_checkpoint_info() + padding_tensor = padded_vocab_tensor.narrow(0, checkpoint_info[ORIGINAL_VOCAB_SIZE], padded_vocab_tensor.shape[0]-checkpoint_info[ORIGINAL_VOCAB_SIZE]) + 
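    # narrow(0, start, length) returns a view of rows [start, start+length): the
    # line above isolates the rows appended by _vocab_size_with_padding, and the
    # return below keeps only the first original_vocab_size rows, i.e. the
    # embedding matrix with the divisibility padding stripped.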
print(f'{padded_vocab_tensor[checkpoint_info[ORIGINAL_VOCAB_SIZE]-3:,:]=}') return padded_vocab_tensor.narrow(0, 0, checkpoint_info[ORIGINAL_VOCAB_SIZE]) @@ -246,7 +248,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): if "word_embeddings.weight" in name: print(f"Before {param.shape=}") # strip padding - param = _strip_vocab_padding(ds_checkpoint, param) + #param = _strip_vocab_padding(ds_checkpoint, param) ckpt_dict['tensor_to_pad'] = True print(f"After {param.shape=}") From 795fedbb40b2f2e456839ab889a162a042904409 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 3 Jun 2022 18:01:31 -0700 Subject: [PATCH 25/33] args should be required, don't create another latest file --- tools/convert_checkpoint/ds_to_universal.py | 24 ++++++++++----------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 6ace88f8d..66e5b64f6 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -38,11 +38,9 @@ def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--input_folder', - default=None, type=str, help='Input DeepSpeed Checkpoint folder') parser.add_argument('--output_folder', - default=None, type=str, help='Output Megatron checkpoint folder') parser.add_argument('--target_tp', @@ -104,11 +102,11 @@ def _save_checkpoint(file_path, chkpt_sd): -def _create_latest_file(base_folder, iteration): - file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt') - os.makedirs(base_folder, exist_ok=True) - with open(file_path, 'w') as f: - f.write(str(iteration)) +# def _create_latest_file(base_folder, iteration): +# file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt') +# os.makedirs(base_folder, exist_ok=True) +# with open(file_path, 'w') as f: +# f.write(str(iteration)) # XXX: this is a temp hack that creates fake params but with the right shapes def save_params_universal(dir, slice_shapes): @@ -268,7 +266,7 @@ def main(): ds_checkpoint = DeepSpeedCheckpoint(args.input_folder)#, 1, 2) # args.target_tp, args.target_pp) iteration = ds_checkpoint.get_iteration() - _create_latest_file(args.output_folder, iteration) + #_create_latest_file(args.output_folder, iteration) checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, ds_checkpoint.pp_degree) @@ -290,14 +288,14 @@ def main(): print(f"{i=}, {j=}, {k=}") extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) - merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) + merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) shutil.rmtree(temp_dir, ignore_errors=True) - - # Copy mp* files into output folder + + # Copy mp* files into output folder for f in glob.glob(os.path.join(args.input_folder, 'mp*')): - shutil.copy2(f, args.output_folder) + shutil.copy2(f, args.output_folder) - # Update latest to output folder + # Update latest to output folder checkpoint_root_folder, step_folder = os.path.split(args.output_folder) latest_file = os.path.join(checkpoint_root_folder, 'latest_universal') with open(latest_file, "w") as f: From cc8810be186d06e68f4317bae863b2007da8ca96 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Tue, 7 Jun 2022 03:18:22 +0500 Subject: [PATCH 26/33] Parallelize shard extraction --- run_bf16.sh | 4 +- 
run_universal_bf16.sh | 8 ++-- tools/convert_checkpoint/ds_to_universal.py | 43 +++++++++++++++++---- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/run_bf16.sh b/run_bf16.sh index d1bdb2996..fc884d4af 100755 --- a/run_bf16.sh +++ b/run_bf16.sh @@ -59,8 +59,8 @@ TRAIN_ITERS=100000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} -LR=0 # 6.0e-4 -MIN_LR=0 # 6.0e-5 +LR=6.0e-4 +MIN_LR=6.0e-5 DTYPE="bf16" EXP_DIR=${HOME}/experiments/results/ckpt_reshape LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_cont" diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh index c2d0346a6..1f5f90d28 100755 --- a/run_universal_bf16.sh +++ b/run_universal_bf16.sh @@ -48,7 +48,7 @@ else EXIT_INTERVAL=10 fi -TP=2 +TP=1 PP=2 DP=4 WORLD_SIZE=$((TP*PP*DP)) @@ -59,8 +59,8 @@ TRAIN_ITERS=100000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp4 -LR=0 # 6.0e-4 -MIN_LR=0 # 6.0e-5 +LR=6.0e-4 +MIN_LR=6.0e-5 DTYPE="bf16" EXP_DIR=${HOME}/experiments/results/ckpt_reshape LOG_DIR="${EXP_DIR}/tensorboard/tp${TP}_pp${PP}_dp${DP}_hd${HIDDEN}_nl${LAYERS}_gbsz${GLOBAL_BATCH}_mbsz${MICRO_BATCH}_z${ZERO_STAGE}_LR_${LR}_${MIN_LR}_${DTYPE}_uni" @@ -167,7 +167,7 @@ cat < $CONFIG_JSON } EOT -#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" +WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE" #WORKER_STR="-i worker-0:0,1,2,3" #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}" #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}" diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 66e5b64f6..d0a9e2f47 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -3,6 +3,7 @@ from collections import OrderedDict from copy import deepcopy from email.policy import default +import itertools from pathlib import Path from pprint import pprint import argparse @@ -13,6 +14,7 @@ import sys import torch import re +import multiprocessing # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) @@ -120,7 +122,8 @@ def save_params_universal(dir, slice_shapes): _save_checkpoint(path, param) -def extract_zero_shards(dir, slice_shapes, ds_checkpoint, pp_index, tp_index, dp_index): +def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D): + pp_index, tp_index, dp_index = indices_3D sd = ds_checkpoint.get_zero_checkpoint_state( pp_index=pp_index, tp_index=tp_index, @@ -216,7 +219,12 @@ def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): ] - +def _get_vocab_divisibility_padding_tensor(ds_checkpoint, padded_vocab_tensor): + checkpoint_info = ds_checkpoint.get_checkpoint_info() + if padded_vocab_tensor.shape[0] > checkpoint_info[ORIGINAL_VOCAB_SIZE]: + return padded_vocab_tensor[-1] + else: + return torch.zeros(padded_vocab_tensor.shape[1]) def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): for name, shape in slice_shapes.items(): @@ -247,7 +255,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): print(f"Before {param.shape=}") # strip padding #param = _strip_vocab_padding(ds_checkpoint, param) - ckpt_dict['tensor_to_pad'] = True + ckpt_dict['vocab_divisibility_padding_tensor'] = 
_get_vocab_divisibility_padding_tensor(ds_checkpoint, param) print(f"After {param.shape=}") print(f"Final shape: {param.shape}") @@ -255,6 +263,12 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): _save_checkpoint(final_path, ckpt_dict) + + +def _get_chunks(l, n): + for i in range(0, len(l), n): + yield l[i:i + n] + def main(): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') @@ -282,11 +296,24 @@ def main(): # make fake params # save_params_universal(args.output_folder, slice_shapes) temp_dir = os.path.join(args.output_folder, 'tmp') - for i in range(ds_checkpoint.pp_degree): - for j in range(ds_checkpoint.tp_degree): - for k in range(ds_checkpoint.dp_degree): - print(f"{i=}, {j=}, {k=}") - extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) + _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) + pprint(_3d_range_list) + num_workers = 6 + work_chunks = list(_get_chunks(_3d_range_list, num_workers)) + pprint(work_chunks) + + from functools import partial + calc_stuff = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) + + pool = multiprocessing.Pool(num_workers) + for batch in work_chunks: + pool.map(calc_stuff, batch) + + # for i in range(ds_checkpoint.pp_degree): + # for j in range(ds_checkpoint.tp_degree): + # for k in range(ds_checkpoint.dp_degree): + # print(f"{i=}, {j=}, {k=}") + # extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) shutil.rmtree(temp_dir, ignore_errors=True) From 04d9ad0ff2aa2bfbaaa64d8c71613ca974a8d772 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 6 Jun 2022 20:25:11 -0700 Subject: [PATCH 27/33] close+join pool; add tqdm; comment out noise --- tools/convert_checkpoint/ds_to_universal.py | 47 ++++++++++++--------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index d0a9e2f47..722df6ef6 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -3,18 +3,20 @@ from collections import OrderedDict from copy import deepcopy from email.policy import default -import itertools +from functools import partial from pathlib import Path from pprint import pprint import argparse import glob +import itertools import logging +import multiprocessing import os +import re import shutil import sys import torch -import re -import multiprocessing +import tqdm # insert megatron's root dir into sys.path root_repo_path = str(Path(__file__).resolve().parents[2]) @@ -115,7 +117,7 @@ def save_params_universal(dir, slice_shapes): for name, shape in slice_shapes.items(): param_base_path = os.path.join(dir, name) os.makedirs(param_base_path, exist_ok=True) - print(f"{name}: {shape} => {param_base_path}") + #print(f"{name}: {shape} => {param_base_path}") for state in ("fp32", "exp_avg", "exp_avg_sq"): path = os.path.join(param_base_path, f"{state}.pt") param = torch.Tensor(shape) @@ -153,7 +155,7 @@ def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D): # Skip tied weights that are replicated in first and last pp stages continue - print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") + #print(f"{param_group_id} {name} => {fragment_mapping.start}:{fragment_mapping.numel}") for state_key in 
flat_state.keys(): dump_param_fragment(dir, tp_index, dp_index, state_key, flat_state[state_key], name, fragment_mapping.start, fragment_mapping.numel) @@ -173,7 +175,7 @@ def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, path = os.path.join(param_base_path, f"{state_name}.{counter}") - print(f"{param_name}: {offset}: {numel} => {path}") + #print(f"{param_name}: {offset}: {numel} => {path}") t = state_flat_tensor.narrow(0, offset, numel) _save_checkpoint(path, t) @@ -184,7 +186,7 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): for tp_index in range(tp_degree): prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") paths = sorted(list(glob.glob(f"{prefix_path}.0*"))) - print(paths) + #print(paths) shards = [torch.load(p) for p in paths] slice = torch.cat(shards, dim=0).reshape(slice_shape) slices.append(slice) @@ -196,7 +198,7 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor): checkpoint_info = ds_checkpoint.get_checkpoint_info() padding_tensor = padded_vocab_tensor.narrow(0, checkpoint_info[ORIGINAL_VOCAB_SIZE], padded_vocab_tensor.shape[0]-checkpoint_info[ORIGINAL_VOCAB_SIZE]) - print(f'{padded_vocab_tensor[checkpoint_info[ORIGINAL_VOCAB_SIZE]-3:,:]=}') + #print(f'{padded_vocab_tensor[checkpoint_info[ORIGINAL_VOCAB_SIZE]-3:,:]=}') return padded_vocab_tensor.narrow(0, 0, checkpoint_info[ORIGINAL_VOCAB_SIZE]) @@ -230,7 +232,7 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): for name, shape in slice_shapes.items(): slice_base_path = os.path.join(slice_dir, name) param_base_path = os.path.join(dir, name) - print(f"\n{name}: {shape} => {slice_base_path} -----> {param_base_path}") + #print(f"\n{name}: {shape} => {slice_base_path} -----> {param_base_path}") # XXX: shouldn't be in the states if "position_embeddings" in name: @@ -240,25 +242,25 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree): slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) final_path = os.path.join(param_base_path, f"{state}.pt") - print(f"Expected shape: {shape}") - print(f"Fragment sizes:", list(frag.shape for frag in slices)) + #print(f"Expected shape: {shape}") + #print(f"Fragment sizes:", list(frag.shape for frag in slices)) ckpt_dict = {} if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): param = sum(slices) / len(slices) else: cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - print(f"CAT DIM: {cat_dim}") + #print(f"CAT DIM: {cat_dim}") param = torch.cat(slices, dim=cat_dim) ckpt_dict['cat_dim'] = cat_dim if "word_embeddings.weight" in name: - print(f"Before {param.shape=}") + #print(f"Before {param.shape=}") # strip padding #param = _strip_vocab_padding(ds_checkpoint, param) ckpt_dict['vocab_divisibility_padding_tensor'] = _get_vocab_divisibility_padding_tensor(ds_checkpoint, param) - print(f"After {param.shape=}") + #print(f"After {param.shape=}") - print(f"Final shape: {param.shape}") + #print(f"Final shape: {param.shape}") ckpt_dict['param'] = param _save_checkpoint(final_path, ckpt_dict) @@ -297,17 +299,19 @@ def main(): # save_params_universal(args.output_folder, slice_shapes) temp_dir = os.path.join(args.output_folder, 'tmp') _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) - pprint(_3d_range_list) + #pprint(_3d_range_list) 
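    # _3d_range_list enumerates every (pp, tp, dp) index triple; _get_chunks then
    # splits it into batches of num_workers entries, so each pool.map call below
    # hands at most one extract_zero_shards task to each worker process.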
num_workers = 6 - work_chunks = list(_get_chunks(_3d_range_list, num_workers)) - pprint(work_chunks) + work_chunks = list(_get_chunks(_3d_range_list, num_workers)) + #pprint(work_chunks) - from functools import partial calc_stuff = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) + print('*** 1. Extracting ZeRO fragments') pool = multiprocessing.Pool(num_workers) - for batch in work_chunks: + for batch in tqdm.tqdm(work_chunks): pool.map(calc_stuff, batch) + pool.close() + pool.join() # for i in range(ds_checkpoint.pp_degree): # for j in range(ds_checkpoint.tp_degree): @@ -315,6 +319,7 @@ def main(): # print(f"{i=}, {j=}, {k=}") # extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) + print('*** 2. Merging slices') merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) shutil.rmtree(temp_dir, ignore_errors=True) @@ -328,6 +333,8 @@ def main(): with open(latest_file, "w") as f: f.write(step_folder) + print('*** Done!') + if __name__ == "__main__": main() From bca5af4e50133fbe7e40a7d5642fbf7ae019dfea Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 6 Jun 2022 20:32:00 -0700 Subject: [PATCH 28/33] rename --- tools/convert_checkpoint/ds_to_universal.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 722df6ef6..e0e39e6af 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -304,12 +304,12 @@ def main(): work_chunks = list(_get_chunks(_3d_range_list, num_workers)) #pprint(work_chunks) - calc_stuff = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) + do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) print('*** 1. Extracting ZeRO fragments') pool = multiprocessing.Pool(num_workers) for batch in tqdm.tqdm(work_chunks): - pool.map(calc_stuff, batch) + pool.map(do_work, batch) pool.close() pool.join() From 721380b2802032c459665302ffe51a042d802c0c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 6 Jun 2022 20:35:52 -0700 Subject: [PATCH 29/33] parameterize --- tools/convert_checkpoint/ds_to_universal.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index e0e39e6af..144dea1f1 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -55,6 +55,10 @@ def parse_arguments(): default=1, type=int, help='Target PP degree') + parser.add_argument('--num_workers', + default=4, + type=int, + help='How many parallel processes to use') parser.add_argument( '--for_release', action='store_true', @@ -300,14 +304,13 @@ def main(): temp_dir = os.path.join(args.output_folder, 'tmp') _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) #pprint(_3d_range_list) - num_workers = 6 - work_chunks = list(_get_chunks(_3d_range_list, num_workers)) + work_chunks = list(_get_chunks(_3d_range_list, args.num_workers)) #pprint(work_chunks) do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) print('*** 1. 
Extracting ZeRO fragments')
-    pool = multiprocessing.Pool(num_workers)
+    pool = multiprocessing.Pool(args.num_workers)
     for batch in tqdm.tqdm(work_chunks):
         pool.map(do_work, batch)
     pool.close()

From e8a1ccf1b8aa5b98b7f7cfc804e0d62e26b052f4 Mon Sep 17 00:00:00 2001
From: Olatunji Ruwase
Date: Wed, 8 Jun 2022 00:08:39 +0500
Subject: [PATCH 30/33] Parallel slice merging

---
 run_universal_bf16.sh                       |   6 +-
 tools/convert_checkpoint/ds_to_universal.py | 122 +++++++++++---------
 2 files changed, 71 insertions(+), 57 deletions(-)

diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh
index 1f5f90d28..e06f242a6 100755
--- a/run_universal_bf16.sh
+++ b/run_universal_bf16.sh
@@ -45,10 +45,10 @@ else
 HIDDEN=1024
 LAYERS=24
 SEQ=1024
- EXIT_INTERVAL=10
+ EXIT_INTERVAL=10000
 fi
 
-TP=1
+TP=2
 PP=2
 DP=4
 WORLD_SIZE=$((TP*PP*DP))
@@ -167,7 +167,7 @@ cat < $CONFIG_JSON
 }
 EOT
 
-WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
+#WORKER_STR="--num_nodes 1 --num_gpus $WORLD_SIZE"
 #WORKER_STR="-i worker-0:0,1,2,3"
 #run_cmd="deepspeed -i worker-0:0,1,2,3 ${DIR}/pretrain_gpt.py $@ ${options}"
 #run_cmd="deepspeed -i worker-0 ${DIR}/pretrain_gpt.py $@ ${options}"
diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py
index 144dea1f1..8609a8f4c 100755
--- a/tools/convert_checkpoint/ds_to_universal.py
+++ b/tools/convert_checkpoint/ds_to_universal.py
@@ -55,10 +55,14 @@ def parse_arguments():
                         default=1,
                         type=int,
                         help='Target PP degree')
-    parser.add_argument('--num_workers',
+    parser.add_argument('--num_extract_workers',
                         default=4,
                         type=int,
-                        help='How many parallel processes to use')
+                        help='How many parallel processes to extract zero shards')
+    parser.add_argument('--num_merge_workers',
+                        default=2,
+                        type=int,
+                        help='How many parallel processes to merge tp slices (more memory intensive, use much fewer than --num_extract_workers)')
     parser.add_argument(
         '--for_release',
         action='store_true',
@@ -135,7 +139,7 @@ def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D):
                                                  tp_index=tp_index,
                                                  dp_index=dp_index)
 
-    pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}")
+    #pprint(f"Processing {dp_index=} {pp_index=}, {tp_index=}")
 
     optim_sd = sd["optimizer_state_dict"]
     param_slice_mappings = optim_sd["param_slice_mappings"]
@@ -232,41 +236,41 @@ def _get_vocab_divisibility_padding_tensor(ds_checkpoint, padded_vocab_tensor):
     else:
         return torch.zeros(padded_vocab_tensor.shape[1])
 
-def merge_tp_slices(ds_checkpoint, dir, slice_dir, slice_shapes, tp_degree):
-    for name, shape in slice_shapes.items():
-        slice_base_path = os.path.join(slice_dir, name)
-        param_base_path = os.path.join(dir, name)
-        #print(f"\n{name}: {shape} => {slice_base_path} -----> {param_base_path}")
+def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape):
+    name, shape = name_and_shape
+    # XXX: shouldn't be in the states
+#     if "position_embeddings" in name:
+#         return
+    slice_base_path = os.path.join(slice_dir, name)
+    param_base_path = os.path.join(dir, name)
 
-        # XXX: shouldn't be in the states
-        if "position_embeddings" in name:
-            continue
+    for state in ("fp32", "exp_avg", "exp_avg_sq"):
+        slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
+        final_path = os.path.join(param_base_path, f"{state}.pt")
 
-        for state in ("fp32", "exp_avg", "exp_avg_sq"):
-            slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
-            final_path = os.path.join(param_base_path, f"{state}.pt")
 
-            #print(f"Expected shape: {shape}")
-            #print(f"Fragment sizes:", 
list(frag.shape for frag in slices)) + ckpt_dict = {} + if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): + param = sum(slices) / len(slices) + else: + cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + #print(f"CAT DIM: {cat_dim}") + param = torch.cat(slices, dim=cat_dim) + ckpt_dict['cat_dim'] = cat_dim + + if "word_embeddings.weight" in name: + #print(f"Before {param.shape=}") + # strip padding + #param = _strip_vocab_padding(ds_checkpoint, param) + ckpt_dict['vocab_divisibility_padding_tensor'] = _get_vocab_divisibility_padding_tensor(ds_checkpoint, param) + #print(f"After {param.shape=}") - #print(f"Expected shape: {shape}") - #print(f"Fragment sizes:", list(frag.shape for frag in slices)) - ckpt_dict = {} - if any(re.match(pattern, name) for pattern in WEIGHTS_TO_AVERAGE_PATTERNS): - param = sum(slices) / len(slices) - else: - cat_dim = 1 if any(text in name for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 - #print(f"CAT DIM: {cat_dim}") - param = torch.cat(slices, dim=cat_dim) - ckpt_dict['cat_dim'] = cat_dim + #print(f"Final shape: {param.shape}") + ckpt_dict['param'] = param + _save_checkpoint(final_path, ckpt_dict) - if "word_embeddings.weight" in name: - #print(f"Before {param.shape=}") - # strip padding - #param = _strip_vocab_padding(ds_checkpoint, param) - ckpt_dict['vocab_divisibility_padding_tensor'] = _get_vocab_divisibility_padding_tensor(ds_checkpoint, param) - #print(f"After {param.shape=}") - #print(f"Final shape: {param.shape}") - ckpt_dict['param'] = param - _save_checkpoint(final_path, ckpt_dict) @@ -275,6 +279,34 @@ def _get_chunks(l, n): for i in range(0, len(l), n): yield l[i:i + n] + +def _do_parallel_work(do_work, work_chunks, num_workers): + pool = multiprocessing.Pool(num_workers) + for batch in tqdm.tqdm(work_chunks): + pool.map(do_work, batch) + pool.close() + pool.join() + +def _extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir): + _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) + #pprint(_3d_range_list) + work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers)) + #pprint(work_chunks) + + do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) + _do_parallel_work(do_work, work_chunks, args.num_extract_workers) + + + +def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): + work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers)) + #pprint(work_chunks) + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree) + _do_parallel_work(do_work, work_chunks, args.num_merge_workers) + + + def main(): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') @@ -298,32 +330,14 @@ def main(): # fix back to normal flat dict, merge duplicates for tp>1 slice_shapes = dict((k,v) for d in slice_shapes for k,v in d.items() ) - - # make fake params - # save_params_universal(args.output_folder, slice_shapes) temp_dir = os.path.join(args.output_folder, 'tmp') - _3d_range_list = list(itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) - #pprint(_3d_range_list) - work_chunks = list(_get_chunks(_3d_range_list, args.num_workers)) - #pprint(work_chunks) - - do_work = partial(extract_zero_shards, temp_dir, slice_shapes, ds_checkpoint) print('*** 
1. Extracting ZeRO fragments') - pool = multiprocessing.Pool(args.num_workers) - for batch in tqdm.tqdm(work_chunks): - pool.map(do_work, batch) - pool.close() - pool.join() - - # for i in range(ds_checkpoint.pp_degree): - # for j in range(ds_checkpoint.tp_degree): - # for k in range(ds_checkpoint.dp_degree): - # print(f"{i=}, {j=}, {k=}") - # extract_zero_shards(temp_dir, slice_shapes, ds_checkpoint, i, j, k) + _extract_zero_shard_files(args, ds_checkpoint, slice_shapes, temp_dir) print('*** 2. Merging slices') - merge_tp_slices(ds_checkpoint, os.path.join(args.output_folder, "zero"), temp_dir, slice_shapes, ds_checkpoint.tp_degree) + _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) + shutil.rmtree(temp_dir, ignore_errors=True) # Copy mp* files into output folder From a247614b2fa41326ac8482a1735df867778cec85 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Wed, 8 Jun 2022 00:59:18 +0500 Subject: [PATCH 31/33] Cleanup --- run_universal_bf16.sh | 4 ++-- tools/convert_checkpoint/ds_to_universal.py | 21 --------------------- 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/run_universal_bf16.sh b/run_universal_bf16.sh index e06f242a6..7a60c34c1 100755 --- a/run_universal_bf16.sh +++ b/run_universal_bf16.sh @@ -45,7 +45,7 @@ else HIDDEN=1024 LAYERS=24 SEQ=1024 - EXIT_INTERVAL=10000 + EXIT_INTERVAL=10 fi TP=2 @@ -57,7 +57,7 @@ GLOBAL_BATCH=4 MICRO_BATCH=1 TRAIN_ITERS=100000 CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp${DP} -LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp${TP}_pp${PP}_dp4 +LOAD_CHECKPOINT_PATH=checkpoints/gpt2/tp2_pp2_dp4 LR=6.0e-4 MIN_LR=6.0e-5 diff --git a/tools/convert_checkpoint/ds_to_universal.py b/tools/convert_checkpoint/ds_to_universal.py index 8609a8f4c..9a5dd1154 100755 --- a/tools/convert_checkpoint/ds_to_universal.py +++ b/tools/convert_checkpoint/ds_to_universal.py @@ -114,24 +114,6 @@ def _save_checkpoint(file_path, chkpt_sd): -# def _create_latest_file(base_folder, iteration): -# file_path = os.path.join(base_folder, 'latest_checkpointed_iteration.txt') -# os.makedirs(base_folder, exist_ok=True) -# with open(file_path, 'w') as f: -# f.write(str(iteration)) - -# XXX: this is a temp hack that creates fake params but with the right shapes -def save_params_universal(dir, slice_shapes): - for name, shape in slice_shapes.items(): - param_base_path = os.path.join(dir, name) - os.makedirs(param_base_path, exist_ok=True) - #print(f"{name}: {shape} => {param_base_path}") - for state in ("fp32", "exp_avg", "exp_avg_sq"): - path = os.path.join(param_base_path, f"{state}.pt") - param = torch.Tensor(shape) - _save_checkpoint(path, param) - - def extract_zero_shards(dir, slice_shapes, ds_checkpoint, indices_3D): pp_index, tp_index, dp_index = indices_3D sd = ds_checkpoint.get_zero_checkpoint_state( @@ -238,9 +220,6 @@ def _get_vocab_divisibility_padding_tensor(ds_checkpoint, padded_vocab_tensor): def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): name, shape = name_and_shape - # XXX: shouldn't be in the states -# if "position_embeddings" in name: -# return slice_base_path = os.path.join(slice_dir, name) param_base_path = os.path.join(dir, name) From c14df232a6d6eccdf4468d640e5f9b2f7844cd9d Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 23 Jun 2022 13:59:39 +0200 Subject: [PATCH 32/33] Add many-to-many non-ZeRO reshaping --- .../deepspeed_to_deepspeed_nozero.py | 276 ++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py diff --git 
a/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py b/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py
new file mode 100644
index 000000000..bb8f62d88
--- /dev/null
+++ b/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py
@@ -0,0 +1,276 @@
+"""
+Many-to-one reshaping for checkpoints trained without ZeRO
+
+Usage example:
+python tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py --input_folder ../global_step156000_old --output_folder ../global_step156000_tp2pp2 --target_tp 2 --target_pp 2
+
+Snippet for manual testing in a Python environment:
+from tools.convert_checkpoint.deepspeed_to_deepspeed_nozero import DeepSpeedCheckpointNoZeRO
+ds_checkpoint = DeepSpeedCheckpointNoZeRO('../global_step156000_old',1,4)
+sd = ds_checkpoint.get_embedding_state(0)
+sd['word_embeddings.weight'].shape
+
+Notes:
+- You need changes from https://github.com/microsoft/DeepSpeed/pull/1953/ for this script to work
+- There is a bug in PP layers in the above PR, so you may unexpectedly lose layers if reshaping to PP>1, see https://github.com/microsoft/DeepSpeed/pull/1953/files#r904886186
+- Only tested on FP16 Checkpoints
+- Data Parallelism is irrelevant here, as it does not influence non-ZeRO ckpts.
+"""
+
+import argparse
+import os
+from pathlib import Path
+import sys
+
+import torch
+
+# Insert megatron's root dir into sys.path
+root_repo_path = str(Path(__file__).resolve().parents[2])
+if root_repo_path not in sys.path:
+    sys.path.insert(0, root_repo_path)
+
+
+from megatron.tokenizer.tokenizer import _vocab_size_with_padding
+
+from deepspeed.checkpoint import (
+    DeepSpeedCheckpoint,
+    get_model_ckpt_name_for_rank,
+    get_layer_ckpt_name_for_rank,
+)
+from deepspeed.checkpoint.deepspeed_checkpoint import (
+    ARGS_KEY,
+    CHECKPOINT_INFO_KEY,
+    EMBEDDING_LAYER_INDEX,
+    FINAL_LAYER_NORM_INDEX,
+    SEQUENTIAL_LAYERS,
+    LAYER_CONCAT_DIM
+)
+from deepspeed.checkpoint.reshape_meg_2d import reshape_meg_2d_parallel, meg_2d_parallel_map
+from deepspeed.checkpoint.reshape_utils import (get_files, get_files_with_prefix)
+from deepspeed.checkpoint.constants import (LAYER_FILE_PREFIX, MODEL_FILE_PREFIX)
+
+
+# Add layers that should not be concatenated
+# The below are just copies across tp parallel files, thus we do not need to merge them
+SEQUENTIAL_LAYERS.append('word_embeddings.norm.weight')
+SEQUENTIAL_LAYERS.append('word_embeddings.norm.bias')
+
+class DeepSpeedCheckpointNoZeRO(DeepSpeedCheckpoint):
+    def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None):
+        self.dir = dir
+        self._validate_folder(dir)
+
+        self.file_list = get_files(dir)
+        self.zero_files = [] #get_files_with_prefix(self.file_list, ZERO_FILE_PREFIX)
+        self.layer_files = get_files_with_prefix(self.file_list, LAYER_FILE_PREFIX)
+        self.mp_rank_files = get_files_with_prefix(self.file_list, MODEL_FILE_PREFIX)
+
+        self.layer_keys = self._get_layer_keys()
+        self.layer_count = len(self.layer_keys)
+        self.original_tp_degree = len(
+            get_files_with_prefix(self.layer_files,
+                                  f'{LAYER_FILE_PREFIX}01'))
+        self.original_pp_degree = len(self.mp_rank_files) // self.original_tp_degree
+        self.original_dp_degree = max(
+            1,
+            len(self.zero_files) // (self.original_pp_degree * self.original_tp_degree))
+
+        self.tp_degree = self.original_tp_degree if tp_degree is None else tp_degree
+        self.pp_degree = self.original_pp_degree if pp_degree is None else pp_degree
+        self.dp_degree = self.original_dp_degree if dp_degree is None else dp_degree
+
+        self.original_world_size = self.original_tp_degree * self.original_pp_degree * 
self.original_dp_degree + self.world_size = self.tp_degree * self.pp_degree # * self.dp_degree + + self.old_2d_map = meg_2d_parallel_map(self.original_pp_degree, + self.original_tp_degree) + self.old_2d_map.simple_init() + self.new_2d_map = reshape_meg_2d_parallel(old_pp_degree=self.original_pp_degree, + old_tp_degree=self.original_tp_degree, + new_pp_degree=self.pp_degree, + new_tp_degree=self.tp_degree) + + # No ZeRO Checkpoint + # self.zero_checkpoint = ZeROCheckpoint(dir) + # if self.is_change_pp_degree() or self.is_change_tp_degree( + # ) or self.is_change_dp_degree(): + # self.zero_checkpoint.reshape( + # model_3d_desc(self.pp_degree, + # self.tp_degree, + # self.dp_degree)) + + self.global_state = {} + + self._sanity_check() + self.pp_to_transformer_map = self._build_pp_transformer_map() + self.transformer_file_map = self._build_transformer_file_map() + self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map( + FINAL_LAYER_NORM_INDEX) + self._build_global_state() + + # Overwrite _merge_state_dicts to include additional SEQUENTIAL_LAYERS + def _merge_state_dicts(self, sd_list): + merged_sd = {} + for key in sd_list[0].keys(): + if not key in SEQUENTIAL_LAYERS: + cat_dim = LAYER_CONCAT_DIM.get(key, 0) + merged_sd[key] = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) + else: + merged_sd[key] = sd_list[0][key] + + return merged_sd + +CHECKPOINT_FILE_SUFFIX = '_model_states.pt' +MP_WORLD_SIZE ='mp_world_size' +WORD_EMBEDDINGS_KEY = 'word_embeddings.weight' +ORIGINAL_VOCAB_SIZE = 'original_vocab_size' +PADDED_VOCAB_SIZE = 'padded_vocab_size' + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--input_folder', + default=None, + type=str, + help='Input DeepSpeed Checkpoint folder') + parser.add_argument('--output_folder', + default=None, + type=str, + help='Output Megatron checkpoint folder') + parser.add_argument('--target_tp', + default=None, + type=int, + help='Target TP degree') + parser.add_argument('--target_pp', + default=None, + type=int, + help='Target PP degree') + parser.add_argument('--original_vocab_size', + default=250680, + type=int, + help='Vocab Size prior to padding; Default is for BLOOM models for which after padding it is commonly 250880') + args = parser.parse_args() + print(f'args = {args}') + return args + + +def _save_checkpoint(file_path, chkpt_sd): + dir, _ = os.path.split(file_path) + os.makedirs(dir, exist_ok=True) + torch.save(chkpt_sd, file_path) + + +def _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, tp_index, pp_index): + sd_list = ds_checkpoint.get_transformer_state(tp_index, pp_index) + layer_id_list = ds_checkpoint.get_pp_transformer_map(pp_index) + assert len(sd_list) == len(layer_id_list) + for sd, layer_id in zip(sd_list, layer_id_list): + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + layer_id=layer_id, + tp_rank=tp_index) + _save_checkpoint(ckpt_path, sd) + + +def _strip_vocab_padding(ds_checkpoint, padded_vocab_tensor, original_vocab_size=250680): + target_args = ds_checkpoint.get_args() + checkpoint_info = None + if hasattr(ds_checkpoint, "get_checkpoint_info"): + checkpoint_info = ds_checkpoint.get_checkpoint_info() + if checkpoint_info is not None and ORIGINAL_VOCAB_SIZE in checkpoint_info: + original_vocab_size = checkpoint_info[ORIGINAL_VOCAB_SIZE] + target_args.tensor_model_parallel_size = ds_checkpoint.tp_degree + target_args.padded_vocab_size = 
_vocab_size_with_padding(original_vocab_size, target_args) + assert target_args.padded_vocab_size <= padded_vocab_tensor.numel() + if checkpoint_info is not None: + checkpoint_info[PADDED_VOCAB_SIZE] = target_args.padded_vocab_size + # Need to divide by ds_checkpoint.tp_degree to allow many-to-many reshaping e.g. from TP=4 -> TP=2 + # This is because the vocab tensor will be split across tp dimensions + unpadded_vocab_tensor = torch.narrow(padded_vocab_tensor, 0, 0, target_args.padded_vocab_size // ds_checkpoint.tp_degree) + return unpadded_vocab_tensor.clone() + + +def _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, tp_index, original_vocab_size=250680): + sd = ds_checkpoint.get_embedding_state(tp_index) + if ds_checkpoint.is_change_tp_degree(): + sd[WORD_EMBEDDINGS_KEY] = _strip_vocab_padding(ds_checkpoint, sd[WORD_EMBEDDINGS_KEY], + original_vocab_size=original_vocab_size) + layer_id = ds_checkpoint.get_embedding_layer_id() + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, tp_index): + sd = ds_checkpoint.get_final_norm_state(tp_index) + layer_id = ds_checkpoint.get_final_norm_layer_id() + ckpt_path = get_layer_ckpt_name_for_rank( + base_folder=base_folder, + tp_rank=tp_index, + layer_id=layer_id) + _save_checkpoint(ckpt_path, sd) + + +def _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, tp_index, + pp_index): + sd = ds_checkpoint.get_2d_parallel_state(tp_index=tp_index, + pp_index=pp_index) + + # The above merged all tensors including random states tensors + # We just choose the first one as we cannot reuse them all + # This is the same way it is done for NumPy random states which are ignored in the above + fname_list = ds_checkpoint.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) + first_sd = torch.load(fname_list[0], map_location=torch.device('cpu')) + sd['cuda_rng_state'] = first_sd['cuda_rng_state'] + sd['torch_rng_state'] = first_sd['torch_rng_state'] + sd['rng_tracker_states']['model-parallel-rng'] = first_sd['rng_tracker_states']['model-parallel-rng'] + + # DeepSpeed sets the MP_WORLD_SIZE to the size of all non-data-parallel gpus + sd[MP_WORLD_SIZE] = ds_checkpoint.tp_degree * ds_checkpoint.pp_degree + file_id = pp_index * ds_checkpoint.tp_degree + tp_index + ckpt_path = get_model_ckpt_name_for_rank(base_folder, f'{file_id:02d}') + + # Adjust specific fields + sd[ARGS_KEY] = ds_checkpoint.get_args() + sd[ARGS_KEY].tensor_model_parallel_size = ds_checkpoint.tp_degree + sd[ARGS_KEY].pipeline_model_parallel_size = ds_checkpoint.pp_degree + if CHECKPOINT_INFO_KEY in sd: + sd[CHECKPOINT_INFO_KEY][PADDED_VOCAB_SIZE] = sd[ARGS_KEY].padded_vocab_size + _save_checkpoint(ckpt_path, sd) + +def _create_latest_file(base_folder, file_name, latest_tag): + file_path = os.path.join(base_folder, file_name) + os.makedirs(base_folder, exist_ok=True) + with open(file_path, 'w') as f: + f.write(str(latest_tag)) + +def main(): + args = parse_arguments() + print( + f'Converting DeepSpeed checkpoint in {args.input_folder} to DeepSpeed checkpoint in {args.output_folder}' + ) + + ds_checkpoint = DeepSpeedCheckpointNoZeRO( + args.input_folder, + args.target_tp, + args.target_pp) + iteration = ds_checkpoint.get_iteration() + latest_tag = f'global_step{iteration}' + _create_latest_file(args.output_folder, + 'latest_checkpointed_iteration.txt', iteration) + _create_latest_file(args.output_folder, 'latest', 
latest_tag)
+    base_folder = os.path.join(args.output_folder, latest_tag)
+
+    for i in range(ds_checkpoint.tp_degree):
+        _create_embedding_layer_checkpoint(ds_checkpoint, base_folder, i, original_vocab_size=args.original_vocab_size)
+        _create_final_norm_layer_checkpoint(ds_checkpoint, base_folder, i)
+
+        for j in range(ds_checkpoint.pp_degree):
+            _create_transformer_layer_checkpoint(ds_checkpoint, base_folder, i, j)
+            _create_2d_parallel_checkpoint(ds_checkpoint, base_folder, i, j)
+
+if __name__ == "__main__":
+    main()
+

From e8ba66af18badd7ac8327965f2d9de8f0b0ab53b Mon Sep 17 00:00:00 2001
From: Muennighoff
Date: Thu, 23 Jun 2022 14:03:15 +0200
Subject: [PATCH 33/33] Explain many-to-many status

---
 tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py b/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py
index bb8f62d88..c9c0c92fc 100644
--- a/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py
+++ b/tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py
@@ -1,5 +1,5 @@
 """
-Many-to-one reshaping for checkpoints trained without ZeRO
+Many-to-many reshaping for checkpoints trained without ZeRO
 
 Usage example:
 python tools/convert_checkpoint/deepspeed_to_deepspeed_nozero.py --input_folder ../global_step156000_old --output_folder ../global_step156000_tp2pp2 --target_tp 2 --target_pp 2
@@ -14,6 +14,7 @@
 - You need changes from https://github.com/microsoft/DeepSpeed/pull/1953/ for this script to work
 - There is a bug in PP layers in the above PR, so you may unexpectedly lose layers if reshaping to PP>1, see https://github.com/microsoft/DeepSpeed/pull/1953/files#r904886186
 - Only tested on FP16 Checkpoints
+- Only tested for decreasing the parallel dimensions
 - Data Parallelism is irrelevant here, as it does not influence non-ZeRO ckpts.
 """
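
For reference, a minimal sketch of driving the non-ZeRO reshaper from patches 32-33 interactively, along the lines of the docstring snippet above. The checkpoint folder name and the target degrees are illustrative only, and the DeepSpeed changes from PR 1953 called out in the notes are assumed to be installed:

from tools.convert_checkpoint.deepspeed_to_deepspeed_nozero import DeepSpeedCheckpointNoZeRO

# Open a view of an existing non-ZeRO checkpoint and request new target
# degrees (illustrative: reshape down to TP=2, PP=2).
ds_checkpoint = DeepSpeedCheckpointNoZeRO('../global_step156000_old',
                                          tp_degree=2,
                                          pp_degree=2)

# Sanity checks before writing anything out: the iteration recorded in the
# checkpoint and the merged embedding for TP rank 0. Note that vocab padding
# is only stripped later, when _create_embedding_layer_checkpoint writes the
# layer out and the TP degree actually changed.
print(ds_checkpoint.get_iteration())
sd = ds_checkpoint.get_embedding_state(0)
print(sd['word_embeddings.weight'].shape)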