diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
index bf5ac5360..5a388f867 100644
--- a/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
+++ b/applications/DeepSpeed-Chat/dschat/rlhf/ppo_trainer.py
@@ -13,7 +13,9 @@ def print_all_ranks(tag, value, rank):
     world_size = torch.distributed.get_world_size()
-    all_tensor = torch.zeros(world_size, dtype=torch.float32, device=value.device)
+    all_tensor = torch.zeros(world_size,
+                             dtype=torch.float32,
+                             device=value.device)
     all_tensor[rank] = value
     torch.distributed.all_reduce(all_tensor, op=torch.distributed.ReduceOp.SUM)
     print_rank_0(f'{tag} {all_tensor}', rank)
@@ -53,7 +55,8 @@ def __init__(self, rlhf_engine, args):
         self.end_of_conversation_token_id = self.tokenizer(
             args.end_of_conversation_token)['input_ids'][-1]
         self.z3_enabled = args.actor_zero_stage == 3
-        self.calculate_fp32_loss = (self.args.dtype == "bf16") and self.args.bf16_to_fp32_loss
+        self.calculate_fp32_loss = (self.args.dtype
+                                    == "bf16") and self.args.bf16_to_fp32_loss

         # In case the generated experience is not valid (too short), we use the last valid
         # generated experience. Alternatively, we can skip the step (on all workers).
@@ -89,15 +92,17 @@ def _generate_sequence(self, prompts, mask, step):
         if is_hpu() and self.args.enable_hpu_graphs:
             orig_actor_model_fwd_fn = self.actor_model.module.forward
             if self.first_generate:
-                self.actor_model.module.forward = thpu.wrap_in_hpu_graph_func(self.actor_model.module.forward)
+                self.actor_model.module.forward = thpu.wrap_in_hpu_graph_func(
+                    self.actor_model.module.forward)
                 self.first_generate = False
             else:
                 self.actor_model.module.forward = self.actor_model_hpu_graph_wrapped_fwd_fn
-            seq = self.actor_model.module.generate(prompts,
-                                                   attention_mask=mask,
-                                                   max_length=max_min_length,
-                                                   min_length=max_min_length,
-                                                   lazy_mode=True)
+            seq = self.actor_model.module.generate(
+                prompts,
+                attention_mask=mask,
+                max_length=max_min_length,
+                min_length=max_min_length,
+                lazy_mode=True)
             self.actor_model_hpu_graph_wrapped_fwd_fn = self.actor_model.module.forward
             self.actor_model.module.forward = orig_actor_model_fwd_fn
         else:
@@ -117,7 +122,8 @@ def _generate_sequence(self, prompts, mask, step):
         ans = seq[:, prompt_length:]
         valid_ans_len = (ans != self.tokenizer.pad_token_id).sum(dim=-1)

-        if self.args.print_answers and (step % self.args.print_answers_interval == 0):
+        if self.args.print_answers and (step % self.args.print_answers_interval
+                                        == 0):
             print(
                 f"--- prompt --> step={step}, rank={torch.distributed.get_rank()}, {self.tokenizer.batch_decode(prompts, skip_special_tokens=True)}"
             )
@@ -129,17 +135,21 @@ def _generate_sequence(self, prompts, mask, step):
         for i in range(batch_size):
             if valid_ans_len[
                     i] <= 1:  # if the answer is shorter than 1 token, drop it
-                print(f'Dropping too short generated answer: {step=}: \n'
-                      f'prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n'
-                      f'answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}')
+                print(
+                    f'Dropping too short generated answer: {step=}: \n'
+                    f'prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n'
+                    f'answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}'
+                )
                 continue
             else:
                 out_seq.append(seq[i:i + 1])

         if not out_seq:
-            print(f'All generated results are too short for rank={self.args.local_rank} step={step}\n'
-                  f'-> prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n'
-                  f'-> answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}')
+            print(
+                f'All generated results are too short for rank={self.args.local_rank} step={step}\n'
+                f'-> prompts: {self.tokenizer.batch_decode(prompts, skip_special_tokens=False)}\n'
+                f'-> answers: {self.tokenizer.batch_decode(ans, skip_special_tokens=False)}'
+            )
             return None

         out_seq = torch.cat(out_seq, dim=0)  # concat output in the batch dim
diff --git a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py
index cdc8ac8e3..52e13a446 100755
--- a/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py
+++ b/applications/DeepSpeed-Chat/dschat/rlhf/rlhf_engine.py
@@ -108,7 +108,8 @@ def _init_actor(self, actor_model_name_or_path):
         # TODO SW-146776: remove this WA once SW-141762 is resolved
         if is_hpu():
             import habana_frameworks.torch.core as htcore
-            actor_model.to(dtype=torch.bfloat16, device=get_accelerator().device())
+            actor_model.to(dtype=torch.bfloat16,
+                           device=get_accelerator().device())

         # Optimizer
         if self.args.offload:
@@ -117,7 +118,9 @@ def _init_actor(self, actor_model_name_or_path):
             AdamOptimizer = torch.optim.AdamW
         else:
             AdamOptimizer = FusedAdam
-        print_rank_0(f'Using {AdamOptimizer.__name__} optimizer for actor model', self.args.global_rank)
+        print_rank_0(
+            f'Using {AdamOptimizer.__name__} optimizer for actor model',
+            self.args.global_rank)
         optim_params = get_optimizer_grouped_parameters(
             actor_model, self.args.actor_weight_decay,
@@ -249,7 +252,8 @@ def _init_critic(self, critic_model_name_or_path):

         # TODO SW-146776: remove this WA once SW-141762 is resolved
         if is_hpu():
-            critic_model.to(dtype=torch.bfloat16, device=get_accelerator().device())
+            critic_model.to(dtype=torch.bfloat16,
+                            device=get_accelerator().device())

         # Optimizer
         # TODO SW-147425: change the file to use HPEX optimizer instead of AdamW on hpu
@@ -259,7 +263,9 @@ def _init_critic(self, critic_model_name_or_path):
             AdamOptimizer = torch.optim.AdamW
         else:
             AdamOptimizer = FusedAdam
-        print_rank_0(f'Using {AdamOptimizer.__name__} optimizer for critic model', self.args.global_rank)
+        print_rank_0(
+            f'Using {AdamOptimizer.__name__} optimizer for critic model',
+            self.args.global_rank)
         optim_params = get_optimizer_grouped_parameters(
             critic_model, self.args.critic_weight_decay,
diff --git a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py
index deadc2a57..628b04826 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/data/data_utils.py
@@ -298,8 +298,10 @@ def create_prompt_dataset(local_rank,
     eval_fname = f"{output_path}/evaldata_{fname}.pt"

     cache_found = os.path.isfile(train_fname) and os.path.isfile(eval_fname)
-    device = torch.device(get_accelerator().device_name(torch.distributed.get_rank()))
-    buf_create_cache = get_accelerator().ByteTensor([not cache_found], device=device)
+    device = torch.device(get_accelerator().device_name(
+        torch.distributed.get_rank()))
+    buf_create_cache = get_accelerator().ByteTensor([not cache_found],
+                                                    device=device)
     torch.distributed.all_reduce(buf_create_cache)

     if local_rank <= 0 and (buf_create_cache.item() != 0 or reload):
diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
index 34801d80b..6d13ac26d 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/model/model_utils.py
@@ -84,7 +84,8 @@ def causal_lm_forward(

 def configure_dropout(model_config, dropout):
     if dropout is not None:
-        for key in ('dropout', 'attention_dropout', 'hidden_dropout', 'activation_dropout'):
+        for key in ('dropout', 'attention_dropout', 'hidden_dropout',
+                    'activation_dropout'):
             if hasattr(model_config, key):
                 print(f"Setting model_config.{key} to {dropout}")
                 setattr(model_config, key, dropout)
@@ -92,27 +93,31 @@ def configure_dropout(model_config, dropout):

 def causal_lm_model_to_fp32_loss(model):
     """ Convert CausalLM model to calculate loss in fp32 """
-    def causal_lm_forward(input_ids=None,
-                          past_key_values=None,
-                          attention_mask=None,
-                          head_mask=None,
-                          inputs_embeds=None,
-                          labels=None,
-                          use_cache=None,
-                          output_attentions=None,
-                          output_hidden_states=None,
-                          return_dict=None,
-                          **deprecated_arguments, ):
-        output = model.__original_forward__(input_ids=input_ids,
-                                            past_key_values=past_key_values,
-                                            attention_mask=attention_mask,
-                                            head_mask=head_mask,
-                                            inputs_embeds=inputs_embeds,
-                                            labels=None,
-                                            use_cache=use_cache,
-                                            output_attentions=output_attentions,
-                                            output_hidden_states=output_hidden_states,
-                                            return_dict=return_dict)
+
+    def causal_lm_forward(
+            input_ids=None,
+            past_key_values=None,
+            attention_mask=None,
+            head_mask=None,
+            inputs_embeds=None,
+            labels=None,
+            use_cache=None,
+            output_attentions=None,
+            output_hidden_states=None,
+            return_dict=None,
+            **deprecated_arguments,
+    ):
+        output = model.__original_forward__(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            labels=None,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)

         return_dict = isinstance(output, dict)
         lm_logits = output.logits if return_dict else output[0]
@@ -127,12 +132,12 @@ def causal_lm_forward(input_ids=None,
             # Flatten the tokens
             loss_fct = torch.nn.CrossEntropyLoss()
             loss = loss_fct(
-                shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length)
-            )
+                shift_logits.view(batch_size * seq_length, vocab_size),
+                shift_labels.view(batch_size * seq_length))

         if not return_dict:
             # re-pack output with fp32 loss
-            return ((loss,) + output) if loss is not None else output
+            return ((loss, ) + output) if loss is not None else output

         output.loss = loss
         return output
diff --git a/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py b/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py
index 9a3808cdc..38a79ab2c 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/model/reward_model.py
@@ -10,7 +10,12 @@
 ## https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py
 class RewardModel(nn.Module):

-    def __init__(self, base_model, tokenizer, num_padding_at_beginning=0, loss_to_fp32=False, opt_loss_calc=False):
+    def __init__(self,
+                 base_model,
+                 tokenizer,
+                 num_padding_at_beginning=0,
+                 loss_to_fp32=False,
+                 opt_loss_calc=False):
         super().__init__()
         self.config = base_model.config
         self.num_padding_at_beginning = num_padding_at_beginning
@@ -112,7 +117,9 @@ def get_last_before_padding(paddings, num_begin_padding):

                 # united_unpadding_mask will what are the unite between the unpadded elements
                 # will indicate 1's where we have non padded tokens, in either of the inputs
-                united_unpadding_mask = torch.logical_not(torch.logical_and(chosen_padding_mask, rejected_padding_mask))
+                united_unpadding_mask = torch.logical_not(
+                    torch.logical_and(chosen_padding_mask,
+                                      rejected_padding_mask))

                 # get a mask of all the different tokens
                 divergence_mask = (chosen_id != rejected_id)
@@ -120,27 +127,40 @@ def get_last_before_padding(paddings, num_begin_padding):

                 # loss mask indicates the elements which should be taken into consideration after sigmoid calc
                 # from the first divergence, till the last non padded token
-                loss_mask = torch.logical_and(divergence_mask, united_unpadding_mask)
-                loss_mask = torch.where(divergence_mask.sum().bool(), loss_mask, self.fallback_mask)
+                loss_mask = torch.logical_and(divergence_mask,
+                                              united_unpadding_mask)
+                loss_mask = torch.where(divergence_mask.sum().bool(),
+                                        loss_mask, self.fallback_mask)

                 # calc logsigmoid on all the input and mask the not interesting ones
                 if self.loss_to_fp32:
                     chosen_reward = chosen_reward.float()
                     rejected_reward = rejected_reward.float()
-                logsigmoid = torch.nn.functional.logsigmoid(chosen_reward.float() - rejected_reward.float()) * loss_mask
+                logsigmoid = torch.nn.functional.logsigmoid(
+                    chosen_reward.float() -
+                    rejected_reward.float()) * loss_mask

                 #average according to the interesting number of elements
                 num_elements_in_loss = loss_mask.sum().float()
                 loss += -(logsigmoid.sum() / num_elements_in_loss)

                 # log the c_ind / r_ind in chosen_mean_scores / rejected_mean_scores
-                c_ind_mask = get_last_before_padding(chosen_padding_mask, self.num_padding_at_beginning)
-                c_ind_mask = torch.where(chosen_padding_mask.sum() > self.num_padding_at_beginning, c_ind_mask, self.fallback_mask)
-                chosen_mean_score = (c_ind_mask.float() * chosen_reward.float()).sum()
+                c_ind_mask = get_last_before_padding(
+                    chosen_padding_mask, self.num_padding_at_beginning)
+                c_ind_mask = torch.where(
+                    chosen_padding_mask.sum() > self.num_padding_at_beginning,
+                    c_ind_mask, self.fallback_mask)
+                chosen_mean_score = (c_ind_mask.float() *
+                                     chosen_reward.float()).sum()
                 chosen_mean_scores.append(chosen_mean_score)

-                r_ind_mask = get_last_before_padding(rejected_padding_mask, self.num_padding_at_beginning)
-                r_ind_mask = torch.where(rejected_padding_mask.sum() > self.num_padding_at_beginning, r_ind_mask, self.fallback_mask)
-                rejected_mean_score = (r_ind_mask.float() * rejected_reward.float()).sum()
+                r_ind_mask = get_last_before_padding(
+                    rejected_padding_mask, self.num_padding_at_beginning)
+                r_ind_mask = torch.where(
+                    rejected_padding_mask.sum() >
+                    self.num_padding_at_beginning, r_ind_mask,
+                    self.fallback_mask)
+                rejected_mean_score = (r_ind_mask.float() *
+                                       rejected_reward.float()).sum()
                 rejected_mean_scores.append(rejected_mean_score)
             else:
                 c_inds = (chosen_id == self.PAD_ID).nonzero()
@@ -156,7 +176,8 @@ def get_last_before_padding(paddings, num_begin_padding):
                 # Check if there is any padding otherwise take length of sequence
                 r_inds = (rejected_id == self.PAD_ID).nonzero()
                 r_ind = r_inds[self.num_padding_at_beginning].item(
-                ) if len(r_inds) > self.num_padding_at_beginning else seq_len
+                ) if len(
+                    r_inds) > self.num_padding_at_beginning else seq_len
                 end_ind = max(c_ind, r_ind)
                 divergence_ind = check_divergence[0]
                 assert divergence_ind > 0
@@ -165,8 +186,8 @@ def get_last_before_padding(paddings, num_begin_padding):
                 if self.loss_to_fp32:
                     c_truncated_reward = c_truncated_reward.float()
                     r_truncated_reward = r_truncated_reward.float()
-                loss += -torch.nn.functional.logsigmoid(c_truncated_reward -
-                                                        r_truncated_reward).mean()
+                loss += -torch.nn.functional.logsigmoid(
+                    c_truncated_reward - r_truncated_reward).mean()

                 chosen_mean_scores.append(
                     chosen_reward[c_ind - 1])  #use the end score for reference
diff --git a/applications/DeepSpeed-Chat/dschat/utils/utils.py b/applications/DeepSpeed-Chat/dschat/utils/utils.py
index e98dfad35..7ef7e0df9 100644
--- a/applications/DeepSpeed-Chat/dschat/utils/utils.py
+++ b/applications/DeepSpeed-Chat/dschat/utils/utils.py
@@ -92,7 +92,9 @@ def get_tokenizer(model_name_or_path, fast_tokenizer=True):
     return tokenizer


-def load_hf_tokenizer(model_name_or_path, fast_tokenizer=True, add_special_tokens=None):
+def load_hf_tokenizer(model_name_or_path,
+                      fast_tokenizer=True,
+                      add_special_tokens=None):
     if os.path.exists(model_name_or_path):
         # Locally tokenizer loading has some issue, so we need to force download
         model_json = os.path.join(model_name_or_path, "config.json")
@@ -109,7 +111,8 @@ def load_hf_tokenizer(model_name_or_path, fast_tokenizer=True, add_special_token
     if add_special_tokens is not None:
         add_special_tokens = [add_special_tokens] if isinstance(add_special_tokens, str) \
             else add_special_tokens
-        tokenizer.add_special_tokens({'additional_special_tokens': add_special_tokens})
+        tokenizer.add_special_tokens(
+            {'additional_special_tokens': add_special_tokens})

     return tokenizer

@@ -208,7 +211,10 @@ def get_optimizer_grouped_parameters(
     model,
     weight_decay,
     lora_lr=5e-4,
-    no_decay_name_list=["bias", "layer_norm.weight", "layernorm.weight", "norm.weight", "ln_f.weight"],
+    no_decay_name_list=[
+        "bias", "layer_norm.weight", "layernorm.weight", "norm.weight",
+        "ln_f.weight"
+    ],
     lora_name_list=["lora_right_weight", "lora_left_weight"],
 ):
     optimizer_grouped_parameters = [
@@ -313,7 +319,8 @@ def print_loss(epoch, step, steps_per_print, gas, loss, loss_sum, rank):
         opt_step = step / gas
         avg_loss = loss_sum / gas
         print_rank_0(
-            f"[{datetime.now()}] epoch: {epoch} | step: {opt_step} | avg_loss: {avg_loss}", rank)
+            f"[{datetime.now()}] epoch: {epoch} | step: {opt_step} | avg_loss: {avg_loss}",
+            rank)
     if step > 0 and step % gas == 0:
         loss_sum.zero_()
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b.sh
deleted file mode 100755
index 51cfb918a..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets"
-./train_step1_bloom_1.1b.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b_lora.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b_lora.sh
deleted file mode 100755
index 61c674338..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step1_bloom_1.1b_lora.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets"
-export HL_GBS=128
-export HL_LEARNING_RATE=0.
-export HL_LORA_LEARNING_RATE=1.1e-2
-export HL_WEIGHT_DECAY=0.1
-export HL_LORA_DIM=128
-export HL_DROPOUT=0.1
-export HL_EPOCHS=4
-./train_step1_bloom_1.1b.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m.sh
deleted file mode 100755
index 5501d2262..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets"
-./train_step2_bloom_560m.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m_lora.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m_lora.sh
deleted file mode 100755
index a7052edfe..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step2_bloom_560m_lora.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets"
-export HL_CRITIC_ZERO_STAGE=1
-export HL_MBS=8
-export HL_GBS=256
-export HL_CRITIC_MODEL=bigscience/bloom-560m
-export HL_LEARNING_RATE=2e-5
-export HL_LORA_LEARNING_RATE=5e-3
-export HL_WEIGHT_DECAY=0.1
-export HL_EPOCHS=3
-export HL_LORA_DIM=128
-export HL_DROPOUT=0.0
-./train_step2_bloom_560m.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m.sh
deleted file mode 100755
index 2d65582cb..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static"
-./train_step3_bloom_1.1b_560m.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m_lora.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m_lora.sh
deleted file mode 100755
index 0ac65c4de..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/run_step3_bloom_1.1b_560m_lora.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-export HL_DATASET_PATH="Dahoas/rm-static"
-export HL_ACTOR_CP_ACT=0
-export HL_CRITIC_CP_ACT=0
-export HL_ACTOR_ZERO_STAGE=1
-export HL_CRITIC_ZERO_STAGE=1
-export HL_MBS=2
-export HL_GBS=64
-export HL_HYBRID_ENGINE=0
-export HL_ACTOR_LR=0.0
-export HL_LORA_ACTOR_LR=4e-4
-export HL_ACTOR_WD=0.1
-export HL_CRITIC_LR=0.0
-export HL_LORA_CRITIC_LR=6e-4
-export HL_CRITIC_WD=0.1
-export HL_LORA_DIM=128
-export HL_ACTOR_DROPOUT=0.0
-export HL_CRITIC_DROPOUT=0.0
-export HL_EPOCHS=1
-export HL_NUM_WARMUP_STEPS=100
-export HL_PRINT_ANSWERS_INTERVAL=0
-export HL_SEED=${HL_SEED:=1}
-./train_step3_bloom_1.1b_560m.sh
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step1_bloom_1.1b.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step1_bloom_1.1b.sh
deleted file mode 100755
index 85291a229..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step1_bloom_1.1b.sh
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company
-
-# -----------------------------------------------------------------------
-# RLHF step1 reference training script for Bloom-1.1B model
-# -----------------------------------------------------------------------
-
-set -ex
-
-DATA_DIR_ROOT=${HL_DATA_DIR_ROOT:-/mnt/weka}
-tag=${HL_TAG:-default_tag}
-base_out_path=${HL_BASE_OUT_PATH:-/root/logs}
-n_nodes=${HL_NUM_NODES:-1}
-n_devices_per_node=${HL_DEVICES_PER_NODE:-8}
-act_zero_stage=${HL_ACTOR_ZERO_STAGE:-1}
-ckp_act=${HL_ACTOR_CP_ACT:-0}
-seed=${HL_SEED:-10}
-mbs=${HL_MBS:-8}
-gbs=${HL_GBS:-128}
-tensorboard_path=${HL_TENSORBOARD_PATH:-}
-log_file=${HL_LOG_FILE:-}
-checkpoint_path=${HL_CHECKPOINT_PATH:-}
-master_port=${HL_MASTER_PORT:-29500}
-model_name_or_path=${HL_ACTOR_MODEL:-bigscience/bloom-1b1}
-dataset_path=${HL_DATASET_PATH}
-learning_rate=${HL_LEARNING_RATE:-2e-5}
-lora_learning_rate=${HL_LORA_LEARNING_RATE:-2e-5}
-weight_decay=${HL_WEIGHT_DECAY:-0.0}
-lora_dim=${HL_LORA_DIM:-0}
-dropout=${HL_DROPOUT:-0.1}
-epochs=${HL_EPOCHS:-4}
-
-# Calculate GAS given global batch, n_nodes, n_devices_per_node
-total_devices=$(($n_nodes*$n_devices_per_node))
-per_device_batch=$(($gbs/$total_devices))
-gas=$(($per_device_batch/$mbs))
-
-# set gradient checkpointing arguments
-ckp_act_args=""
-if [ "$ckp_act" -eq "1" ]; then
-  ckp_act_args="--gradient_checkpointing "
-fi
-
-# setup checkpoint, tensorboard and log path
-prefix_name=${tag}/bloom/step1/1.1b
-run_name=gb_${gbs}_mbs_${mbs}_lr_${learning_rate}_do_${dropout}_wd_${weight_decay}_ep_${epochs}
-
-lora_args=""
-if [ "$lora_dim" -ne "0" ]; then
-  lora_args="--lora_dim ${lora_dim} --lora_learning_rate ${lora_learning_rate} --lora_module_name transformer.h. --only_optimize_lora "
-  run_name=${run_name}_lora_lr_${lora_learning_rate}
-fi
-
-if [ -z "$tensorboard_path" ]; then
-  tensorboard_path=${base_out_path}/tensorboard/${prefix_name}
-fi
-
-if [ -z "$log_file" ]; then
-  log_file=${base_out_path}/logs/${prefix_name}/${run_name}.txt
-fi
-
-if [ -z "$checkpoint_path" ]; then
-  checkpoint_path=${base_out_path}/checkpoints/${prefix_name}/${run_name}
-fi
-
-if [ "$n_nodes" -ne "1" -a -f "$HOSTSFILE" ]
-then
-    MULTINODE_CMD="--hostfile=$HOSTSFILE \
-                   --master_addr $(head -n 1 $HOSTSFILE | sed -n s/[[:space:]]slots.*//p) "
-fi
-
-# create required paths
-# if log-file/tb-path provided, caller should make sure directories exist
-mkdir -p ${base_out_path}/logs/${prefix_name}
-
-# RUN
-script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-training_dir=$( realpath $script_dir/../../../training)
-cd ${training_dir}
-
-CMD="step1_supervised_finetuning/main.py \
-  --model_name_or_path ${model_name_or_path} \
-  --data_path ${dataset_path} \
-  ${lora_args} \
-  --dtype bf16 \
-  --learning_rate ${learning_rate} \
-  --dropout ${dropout} \
-  --weight_decay ${weight_decay} \
-  --per_device_train_batch_size ${mbs} \
-  --gradient_accumulation_steps ${gas} \
-  --num_train_epochs ${epochs} \
-  --num_warmup_steps 20 \
-  --zero_stage ${act_zero_stage} \
-  ${ckp_act_args} \
-  --per_device_eval_batch_size 8 \
-  --seed ${seed} \
-  --deepspeed \
-  --output_dir ${checkpoint_path} \
-  --enable_tensorboard \
-  --tensorboard_path ${tensorboard_path} \
-  --print_loss \
-  --no_fused_kernels"
-
-deepspeed --num_nodes ${n_nodes} \
-          --num_gpus ${n_devices_per_node} \
-          --master_port ${master_port} \
-          $MULTINODE_CMD \
-          $CMD |& tee ${log_file}
-exit $PIPESTATUS
\ No newline at end of file
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step2_bloom_560m.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step2_bloom_560m.sh
deleted file mode 100755
index 76139bc0e..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step2_bloom_560m.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company
-
-# -----------------------------------------------------------------------
-# RLHF step2 reference training script for Bloom-560m model
-# -----------------------------------------------------------------------
-
-set -ex
-
-DATA_DIR_ROOT=${HL_DATA_DIR_ROOT:-/mnt/weka}
-tag=${HL_TAG:-default_tag}
-base_out_path=${HL_BASE_OUT_PATH:-/root/logs}
-n_nodes=${HL_NUM_NODES:-1}
-n_devices_per_node=${HL_DEVICES_PER_NODE:-8}
-cri_zero_stage=${HL_CRITIC_ZERO_STAGE:-1}
-seed=${HL_SEED:-10}
-mbs=${HL_MBS:-8}
-gbs=${HL_GBS:-64}
-tensorboard_path=${HL_TENSORBOARD_PATH:-}
-log_file=${HL_LOG_FILE:-}
-checkpoint_path=${HL_CHECKPOINT_PATH:-}
-master_port=${HL_MASTER_PORT:-29500}
-model_name_or_path=${HL_CRITIC_MODEL:-bigscience/bloom-560m}
-dataset_path=${HL_DATASET_PATH}
-learning_rate=${HL_LEARNING_RATE:-2e-5}
-lora_learning_rate=${HL_LORA_LEARNING_RATE:-2e-5}
-weight_decay=${HL_WEIGHT_DECAY:-0.0}
-epochs=${HL_EPOCHS:-2}
-lora_dim=${HL_LORA_DIM:-0}
-dropout=${HL_DROPOUT:-0.0}
-
-# Calculate GAS given global batch, n_nodes, n_devices_per_node
-total_devices=$(($n_nodes*$n_devices_per_node))
-per_device_batch=$(($gbs/$total_devices))
-gas=$(($per_device_batch/$mbs))
-
-# setup checkpoint, tensorboard and log path
-prefix_name=${tag}/bloom/step2/560m
-run_name=gb_${gbs}_mbs_${mbs}_lr_${learning_rate}_do_${dropout}_wd_${weight_decay}_ep_${epochs}
-
-lora_args=""
-if [ "$lora_dim" -ne "0" ]; then
-  lora_args="--lora_dim ${lora_dim} --lora_learning_rate ${lora_learning_rate} --lora_module_name rwtranrsformer.h. --only_optimize_lora "
-  run_name=${run_name}_lora_lr_${lora_learning_rate}
-fi
-
-if [ -z "$tensorboard_path" ]; then
-  tensorboard_path=${base_out_path}/tensorboard/${prefix_name}
-fi
-
-if [ -z "$log_file" ]; then
-  log_file=${base_out_path}/logs/${prefix_name}/${run_name}.txt
-fi
-
-if [ -z "$checkpoint_path" ]; then
-  checkpoint_path=${base_out_path}/checkpoints/${prefix_name}/${run_name}
-fi
-
-if [ "$n_nodes" -ne "1" -a -f "$HOSTSFILE" ]
-then
-    MULTINODE_CMD="--hostfile=$HOSTSFILE \
-                   --master_addr $(head -n 1 $HOSTSFILE | sed -n s/[[:space:]]slots.*//p) "
-fi
-
-# create required paths
-# if log-file/tb-path provided, caller should make sure directories exist
-mkdir -p ${base_out_path}/logs/${prefix_name}
-
-# RUN
-script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-training_dir=$( realpath $script_dir/../../../training)
-cd ${training_dir}
-
-CMD="step2_reward_model_finetuning/main.py \
-  --model_name_or_path ${model_name_or_path} \
-  --data_path ${dataset_path} \
-  ${lora_args} \
-  --dtype bf16 \
-  --learning_rate ${learning_rate} \
-  --dropout ${dropout} \
-  --weight_decay ${weight_decay} \
-  --per_device_train_batch_size ${mbs} \
-  --gradient_accumulation_steps ${gas} \
-  --num_train_epochs ${epochs} \
-  --num_padding_at_beginning 0 \
-  --zero_stage ${cri_zero_stage} \
-  --seed ${seed} \
-  --optimized_reward_loss_calc \
-  --deepspeed \
-  --output_dir ${checkpoint_path} \
-  --enable_tensorboard \
-  --tensorboard_path ${tensorboard_path} \
-  --print_loss \
-  --no_fused_kernels"
-
-deepspeed --num_nodes ${n_nodes} \
-          --num_gpus ${n_devices_per_node} \
-          --master_port ${master_port} \
-          $MULTINODE_CMD \
-          $CMD |& tee ${log_file}
-exit $PIPESTATUS
-
-# --eval_interval 100 \
-# --eval_iters 100 \
diff --git a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step3_bloom_1.1b_560m.sh b/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step3_bloom_1.1b_560m.sh
deleted file mode 100755
index 9b1d5fd40..000000000
--- a/applications/DeepSpeed-Chat/internal/training_scripts/bloom/train_step3_bloom_1.1b_560m.sh
+++ /dev/null
@@ -1,146 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company
-
-# -----------------------------------------------------------------------
-# RLHF step3 reference training script for Bloom-1.1B + Bloom-560m models
-# -----------------------------------------------------------------------
-
-set -ex
-
-tag=${HL_TAG-default_tag}
-base_out_path=${HL_BASE_OUT_PATH:-/root/logs}
-act_model_path=${HL_ACTOR_MODEL_PATH}
-cri_model_path=${HL_CRITIC_MODEL_PATH}
-dataset_path=${HL_DATASET_PATH}
-n_nodes=${HL_NUM_NODES:-1}
-n_devices_per_node=${HL_DEVICES_PER_NODE:-8}
-act_ckp_act=${HL_ACTOR_CP_ACT:-0}
-cri_ckp_act=${HL_CRITIC_CP_ACT:-0}
-act_zero_stage=${HL_ACTOR_ZERO_STAGE:-1}
-cri_zero_stage=${HL_CRITIC_ZERO_STAGE:-1}
-seed=${HL_SEED:-10}
-mbs=${HL_MBS:-2}
-gbs=${HL_GBS:-64}
-tensorboard_path=${HL_TENSORBOARD_PATH:-}
-log_file=${HL_LOG_FILE:-}
-checkpoint_path=${HL_CHECKPOINT_PATH:-}
-master_port=${HL_MASTER_PORT:-29500}
-hybrid_engine=${HL_HYBRID_ENGINE:-0}
-actor_learning_rate=${HL_ACTOR_LR:-1e-5}
-lora_actor_learning_rate=${HL_LORA_ACTOR_LR:-4e-4}
-actor_weight_decay=${HL_ACTOR_WD:-0.1}
-critic_learning_rate=${HL_CRITIC_LR:-6e-6}
-lora_critic_learning_rate=${HL_LORA_CRITIC_LR:-6e-4}
-critic_weight_decay=${HL_CRITIC_WD:-0.1}
-lora_dim=${HL_LORA_DIM:-0}
-actor_dropout=${HL_ACTOR_DROPOUT:-0.0}
-critic_dropout=${HL_CRITIC_DROPOUT:-0.0}
-epochs=${HL_EPOCHS:-1}
-num_warmup_steps=${HL_NUM_WARMUP_STEPS:-100}
-print_answers_interval=${HL_PRINT_ANSWERS_INTERVAL:-0}
-
-# Calculate GAS given global batch, n_nodes, n_devices_per_node
-total_devices=$(($n_nodes*$n_devices_per_node))
-per_device_batch=$(($gbs/$total_devices))
-gas=$(($per_device_batch/$mbs))
-
-# set LORA args
-lora_args=""
-if [ "$lora_dim" -ne "0" ]; then
-  lora_args=" --actor_lora_dim ${lora_dim} --actor_lora_module_name transformer.h.\
-    --actor_lora_learning_rate ${lora_actor_learning_rate} \
-    --critic_lora_dim ${lora_dim} --critic_lora_module_name rwtranrsformer.h. \
-    --critic_lora_learning_rate ${lora_critic_learning_rate} \
-    --only_optimize_lora "
-  run_name=${run_name}_lora_act_lr_${lora_actor_learning_rate}_lora_cri_lr_${lora_critic_learning_rate}
-fi
-
-# set gradient checkpointing arguments
-ckp_act_args=""
-if [ "$act_ckp_act" -eq "1" ]; then
-  ckp_act_args="--actor_gradient_checkpointing "
-fi
-if [ "$cri_ckp_act" -eq "1" ]; then
-  ckp_act_args="$ckp_act_args --critic_gradient_checkpointing "
-fi
-
-# enable hybrid engine
-hybrid_engine_args=""
-if [ "$hybrid_engine" -eq "1" ]; then
-  hybrid_engine_args="--enable_hybrid_engine "
-fi
-
-# setup checkpoint, tensorboard and log path
-prefix_name=${tag}/bloom/step3/1.1b_560m
-run_name=gb_${gbs}_mbs_${mbs}_ep_${epochs}_act_lr_${actor_learning_rate}_do_${actor_dropout}_wd_${actor_weight_decay}_cri_lr_${critic_learning_rate}_do_${critic_dropout}_wd_${critic_weight_decay}_lora_${lora_dim}
-
-if [ -z "$tensorboard_path" ]; then
-  tensorboard_path=${base_out_path}/tensorboard/${prefix_name}
-fi
-
-if [ -z "$log_file" ]; then
-  log_file=${base_out_path}/logs/${prefix_name}/${run_name}.txt
-fi
-
-if [ -z "$checkpoint_path" ]; then
-  checkpoint_path=${base_out_path}/checkpoints/${prefix_name}/${run_name}
-fi
-
-# configure print answers settings
-print_answers_args=""
-if [ "$print_answers_interval" -ne "0" ]; then
-  print_answers_args="--print_answers --print_answers_interval ${print_answers_interval}"
-fi
-
-if [ "$n_nodes" -ne "1" -a -f "$HOSTSFILE" ]
-then
-    MULTINODE_CMD="--hostfile=$HOSTSFILE \
-                   --master_addr $(head -n 1 $HOSTSFILE | sed -n s/[[:space:]]slots.*//p) "
-fi
-
-# create required paths
-# if log-file/tb-path provided, caller should make sure directories exist
-mkdir -p ${base_out_path}/logs/${prefix_name}
-
-# RUN
-script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-training_dir=$( realpath $script_dir/../../../training)
-cd ${training_dir}
-
-CMD="step3_rlhf_finetuning/main.py \
-  --dtype bf16 \
-  --actor_model_name_or_path ${act_model_path} \
-  --critic_model_name_or_path ${cri_model_path} \
-  ${lora_args} \
-  --data_path ${dataset_path} \
-  --num_train_epochs ${epochs} \
-  --actor_zero_stage ${act_zero_stage} \
-  --critic_zero_stage ${cri_zero_stage} \
-  --num_padding_at_beginning 0 \
-  --per_device_generation_batch_size ${mbs} \
-  --per_device_training_batch_size ${mbs} \
-  --gradient_accumulation_steps ${gas} \
-  --actor_learning_rate ${actor_learning_rate} \
-  --critic_learning_rate ${critic_learning_rate} \
-  --num_warmup_steps ${num_warmup_steps} \
-  --actor_weight_decay ${actor_weight_decay} \
-  --critic_weight_decay ${critic_weight_decay} \
-  --actor_dropout ${actor_dropout} \
-  --critic_dropout ${critic_dropout} \
-  ${ckp_act_args} \
-  ${hybrid_engine_args} \
-  --seed ${seed} \
-  --deepspeed \
-  --output_dir ${checkpoint_path} \
-  --enable_tensorboard \
-  --tensorboard_path ${tensorboard_path} \
-  --no_fused_kernels \
-  ${print_answers_args} \
-  --enable_hpu_graphs"
-
-deepspeed --num_nodes ${n_nodes} \
-          --num_gpus ${n_devices_per_node} \
-          --master_port ${master_port} \
-          $MULTINODE_CMD \
-          $CMD |& tee ${log_file}
-exit $PIPESTATUS
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
index fb4160b84..d183accee 100755
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/main.py
@@ -136,11 +136,12 @@ def parse_args():
     parser.add_argument('--gradient_checkpointing',
                         action='store_true',
                         help='Enable HF gradient checkpointing for model.')
-    parser.add_argument("--dropout",
-                        type=float,
-                        default=None,
-                        help="If dropout configured, use it. "
-                        "Otherwise, keep the default dropout configuration of the model.")
+    parser.add_argument(
+        "--dropout",
+        type=float,
+        default=None,
+        help="If dropout configured, use it. "
+        "Otherwise, keep the default dropout configuration of the model.")
     # deepspeed features
     parser.add_argument('--offload',
                         action='store_true',
@@ -175,11 +176,13 @@ def parse_args():
         "Initial LoRA learning rate (after the potential warmup period) to use."
     )
     ## bf16
-    parser.add_argument('--no_bf16_to_fp32_loss',
-                        action='store_false',
-                        dest='bf16_to_fp32_loss',
-                        help='Relevant only with bf16 dtype. '
-                        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32.')
+    parser.add_argument(
+        '--no_bf16_to_fp32_loss',
+        action='store_false',
+        dest='bf16_to_fp32_loss',
+        help='Relevant only with bf16 dtype. '
+        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32.'
+    )
     ## Tensorboard logging
     parser.add_argument('--enable_tensorboard',
                         action='store_true',
@@ -199,9 +202,10 @@ def parse_args():
         help="Specify the format of the `eot_token`",
     )
     ## Print loss
-    parser.add_argument('--print_loss',
-                        action='store_true',
-                        help='Prints loss at deepspeed config steps_per_print interval.')
+    parser.add_argument(
+        '--print_loss',
+        action='store_true',
+        help='Prints loss at deepspeed config steps_per_print interval.')
     ## Debug
     parser.add_argument('--no_fused_kernels',
                         action='store_true',
                         help='Do not use cuda fused kernels.')
@@ -261,7 +265,9 @@ def main():
                             dropout=args.dropout)

     if (args.dtype == "bf16") and args.bf16_to_fp32_loss:
-        print_rank_0(f"Using model {model.__class__.__name__} with loss in fp32", args.global_rank)
+        print_rank_0(
+            f"Using model {model.__class__.__name__} with loss in fp32",
+            args.global_rank)
         causal_lm_model_to_fp32_loss(model)

     if args.lora_dim > 0:
@@ -272,7 +278,7 @@ def main():
             model = make_model_gradient_checkpointing_compatible(model)

     if is_hpu():
         # TODO SW-146602: remove this WA when SW-141762 is resolved
-        model.to(dtype=torch.bfloat16, device=get_accelerator().device_name())
+        model.to(dtype=torch.bfloat16, device=get_accelerator().device_name())

     # Prepare the data
     train_phase = 1
@@ -388,12 +394,14 @@ def evaluation(model, eval_dataloader):
                 hpu_mark_step()
             end = time.time()
             if torch.distributed.get_rank() == 0:
-                hf_model = model.model if hasattr(model, 'model') else model.module
+                hf_model = model.model if hasattr(model,
+                                                  'model') else model.module
                 print_throughput(hf_model, args, end - start, args.global_rank)
             if args.print_loss:
                 steps_per_print = ds_config['steps_per_print']
-                loss_sum = print_loss(epoch, step, steps_per_print, args.gradient_accumulation_steps,
-                                      loss, loss_sum, args.global_rank)
+                loss_sum = print_loss(epoch, step, steps_per_print,
+                                      args.gradient_accumulation_steps, loss,
+                                      loss_sum, args.global_rank)

         # Evaluate perplexity on the validation set.
         print_rank_0(
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py
index 53a67c259..a25b0edea 100644
--- a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py
+++ b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py
@@ -70,9 +70,10 @@ def parse_args():
                         type=str,
                         default="English",
                         choices=["English", "Chinese", "Japanese"])
-    parser.add_argument("--add_eot_token",
-                        action='store_true',
-                        help="Add <|endoftext|> as additional special token to tokenizer")
+    parser.add_argument(
+        "--add_eot_token",
+        action='store_true',
+        help="Add <|endoftext|> as additional special token to tokenizer")

     args = parser.parse_args()
diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
index e5a578af8..1bc72e636 100644
--- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/main.py
@@ -134,11 +134,12 @@ def parse_args():
         '--gradient_checkpointing',
         action='store_true',
         help='Enable HF gradient checkpointing for Actor model.')
-    parser.add_argument("--dropout",
-                        type=float,
-                        default=None,
-                        help="If dropout configured, use it. "
-                        "Otherwise, keep the default dropout configuration of the model.")
+    parser.add_argument(
+        "--dropout",
+        type=float,
+        default=None,
+        help="If dropout configured, use it. "
+        "Otherwise, keep the default dropout configuration of the model.")
     # deepspeed features
     parser.add_argument('--offload',
                         action='store_true',
@@ -173,11 +174,13 @@ def parse_args():
         "Initial LoRA learning rate (after the potential warmup period) to use."
     )
     ## bf16
-    parser.add_argument('--no_bf16_to_fp32_loss',
-                        action='store_false',
-                        dest='bf16_to_fp32_loss',
-                        help='Relevant only with bf16 dtype. '
-                        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32.')
+    parser.add_argument(
+        '--no_bf16_to_fp32_loss',
+        action='store_false',
+        dest='bf16_to_fp32_loss',
+        help='Relevant only with bf16 dtype. '
+        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32.'
+    )
     # Evaluation
     parser.add_argument("--eval_interval",
                         type=int,
@@ -195,22 +198,27 @@ def parse_args():
                         type=str,
                         default="step2_tensorboard")
     ## Tokenizer
-    parser.add_argument("--add_eot_token",
-                        action='store_true',
-                        help="Add <|endoftext|> as additional special token to tokenizer")
+    parser.add_argument(
+        "--add_eot_token",
+        action='store_true',
+        help="Add <|endoftext|> as additional special token to tokenizer")
     ## Print loss
-    parser.add_argument('--print_loss',
-                        action='store_true',
-                        help='Prints loss at deepspeed config steps_per_print interval.')
+    parser.add_argument(
+        '--print_loss',
+        action='store_true',
+        help='Prints loss at deepspeed config steps_per_print interval.')
     ## Debug
     parser.add_argument('--no_fused_kernels',
                         action='store_true',
                         help='Do not use cuda fused kernels.')
     ## UPH
-    parser.add_argument("--optimized_reward_loss_calc",
-                        action='store_true',
-                        help="Whether to use an optimized approach for RM loss calculation, or legacy flow")
+    parser.add_argument(
+        "--optimized_reward_loss_calc",
+        action='store_true',
+        help=
+        "Whether to use an optimized approach for RM loss calculation, or legacy flow"
+    )
     ## DeepSpeed
     parser = deepspeed.add_config_arguments(parser)
     args = parser.parse_args()
@@ -260,14 +268,15 @@ def main():
         add_special_tokens=additional_special_tokens)

     loss_to_fp32 = (args.dtype == "bf16") and args.bf16_to_fp32_loss
-    rm_model = create_critic_model(args.model_name_or_path,
-                                   tokenizer,
-                                   ds_config,
-                                   args.num_padding_at_beginning,
-                                   dropout=args.dropout,
-                                   zero_stage=args.zero_stage,
-                                   loss_to_fp32=loss_to_fp32,
-                                   optimized_reward_loss_calc=args.optimized_reward_loss_calc)
+    rm_model = create_critic_model(
+        args.model_name_or_path,
+        tokenizer,
+        ds_config,
+        args.num_padding_at_beginning,
+        dropout=args.dropout,
+        zero_stage=args.zero_stage,
+        loss_to_fp32=loss_to_fp32,
+        optimized_reward_loss_calc=args.optimized_reward_loss_calc)

     # Model bigscience/bloom-560m has large variance at ln_f.weight parameter
     # This makes bf16 finetuning hard.
@@ -295,7 +304,8 @@ def main():
                                              args.lora_dim)
         if args.only_optimize_lora:
             force_optimize_params.append('v_head.weight')
-            rm_model = only_optimize_lora_parameters(rm_model, force_optimize_params)
+            rm_model = only_optimize_lora_parameters(rm_model,
+                                                     force_optimize_params)
             rm_model = make_model_gradient_checkpointing_compatible(rm_model)

     # TODO SW-146776: remove this WA once SW-141762 is resolved
@@ -400,10 +410,12 @@ def evaluation_reward(model, dataloader, eval_iters):
     print_rank_0(
         f"***** Evaluating reward, Epoch {0}/{args.num_train_epochs} *****",
         args.global_rank)
-    reward_score, reject_score, acc = evaluation_reward(rm_model, eval_dataloader, args.eval_iters)
-    print_rank_0(f"chosen_last_scores (higher is better) : {reward_score}, "
-                 f"rejected_last_scores (lower is better) : {reject_score}, "
-                 f"acc (higher is better) : {acc}", args.global_rank)
+    reward_score, reject_score, acc = evaluation_reward(
+        rm_model, eval_dataloader, args.eval_iters)
+    print_rank_0(
+        f"chosen_last_scores (higher is better) : {reward_score}, "
+        f"rejected_last_scores (lower is better) : {reject_score}, "
+        f"acc (higher is better) : {acc}", args.global_rank)

     total_micro_steps = 0
     for epoch in range(args.num_train_epochs):
@@ -423,17 +435,24 @@ def evaluation_reward(model, dataloader, eval_iters):
                 hpu_mark_step()
             if args.print_loss:
                 steps_per_print = ds_config['steps_per_print']
-                loss_sum = print_loss(epoch, step, steps_per_print, args.gradient_accumulation_steps,
-                                      loss, loss_sum, args.global_rank)
+                loss_sum = print_loss(epoch, step, steps_per_print,
+                                      args.gradient_accumulation_steps, loss,
+                                      loss_sum, args.global_rank)
             mean_loss += loss.item()
             total_micro_steps += 1
-            gas_boundary = (total_micro_steps % args.gradient_accumulation_steps == 0)
+            gas_boundary = (total_micro_steps %
+                            args.gradient_accumulation_steps == 0)
             total_steps = total_micro_steps // args.gradient_accumulation_steps
-            if args.eval_interval and gas_boundary and (total_steps % args.eval_interval == 0):
-                print_rank_0(f"Iter {total_steps}: Evaluating reward", args.global_rank)
-                reward_score, reject_score, acc = evaluation_reward(rm_model, eval_dataloader, args.eval_iters)
-                print_rank_0(f"Iter {total_steps}: c_scores: {reward_score}, r_scores: {reject_score}, "
-                             f"diff: {reward_score - reject_score}, acc: {acc}", args.global_rank)
+            if args.eval_interval and gas_boundary and (
+                    total_steps % args.eval_interval == 0):
+                print_rank_0(f"Iter {total_steps}: Evaluating reward",
+                             args.global_rank)
+                reward_score, reject_score, acc = evaluation_reward(
+                    rm_model, eval_dataloader, args.eval_iters)
+                print_rank_0(
+                    f"Iter {total_steps}: c_scores: {reward_score}, r_scores: {reject_score}, "
+                    f"diff: {reward_score - reject_score}, acc: {acc}",
+                    args.global_rank)

         print_rank_0(
             f"Epoch {epoch+1}/{args.num_train_epochs} with loss {loss_sum.get_mean()}",
@@ -442,10 +461,12 @@ def evaluation_reward(model, dataloader, eval_iters):
             args.global_rank)
         print_rank_0(
             f"***** Evaluating reward, Epoch {epoch+1}/{args.num_train_epochs} *****",
             args.global_rank)
-        reward_score, reject_score, acc = evaluation_reward(rm_model, eval_dataloader, args.eval_iters)
-        print_rank_0(f"chosen_last_scores (higher is better) : {reward_score}, "
-                     f"rejected_last_scores (lower is better) : {reject_score}, "
-                     f"acc (higher is better) : {acc}", args.global_rank)
+        reward_score, reject_score, acc = evaluation_reward(
+            rm_model, eval_dataloader, args.eval_iters)
+        print_rank_0(
+            f"chosen_last_scores (higher is better) : {reward_score}, "
+            f"rejected_last_scores (lower is better) : {reject_score}, "
+            f"acc (higher is better) : {acc}", args.global_rank)
         rm_model.tput_timer.update_epoch_count()

     if args.output_dir is not None:
diff --git a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py
index 8f09fec37..05e08ad29 100644
--- a/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py
+++ b/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/rw_eval.py
@@ -32,24 +32,25 @@ def parse_args():
     parser.add_argument(
         "--add_eot_token",
         action='store_true',
-        help="Add <|endoftext|> as additional special token to tokenizer"
-    )
+        help="Add <|endoftext|> as additional special token to tokenizer")
     args = parser.parse_args()
     return args


-def load_stuff(model_name_or_path, num_padding_at_beginning, additional_special_tokens):
+def load_stuff(model_name_or_path, num_padding_at_beginning,
+               additional_special_tokens):

     tokenizer = load_hf_tokenizer(model_name_or_path,
                                   fast_tokenizer=True,
                                   add_special_tokens=additional_special_tokens)
     tokenizer.pad_token = tokenizer.eos_token
-    model = create_critic_model(model_name_or_path,
-                                tokenizer,
-                                ds_config=None,
-                                num_padding_at_beginning=num_padding_at_beginning,
-                                rlhf_training=False,
-                                dropout=0.)
+    model = create_critic_model(
+        model_name_or_path,
+        tokenizer,
+        ds_config=None,
+        num_padding_at_beginning=num_padding_at_beginning,
+        rlhf_training=False,
+        dropout=0.)

     return model, tokenizer
@@ -131,12 +132,13 @@ def run_pair_comparison():
     for prompt, good_ans, bad_ans in zip(prompt_list, good_ans_list,
                                          bad_ans_list):
-        batch = prepare_datapair(prompt,
-                                 good_ans,
-                                 bad_ans,
-                                 tokenizer,
-                                 max_seq_len=512,
-                                 end_of_conversation_token=args.end_of_conversation_token)
+        batch = prepare_datapair(
+            prompt,
+            good_ans,
+            bad_ans,
+            tokenizer,
+            max_seq_len=512,
+            end_of_conversation_token=args.end_of_conversation_token)
         batch = to_device(batch, device)
         # Run inference
         with torch.no_grad():
@@ -166,11 +168,12 @@ def run_single_sample():
     prompt = "Human: Explain the moon landing to a 6 year old in a few sentences."
     my_ans = "Assistant: The moon landing was a major milestone in the history of human exploration of the solar system. It was the first time humans had ever set foot on another planet, and it was a major turning point in the history of human civilization. The astronauts, Neil Armstrong, Buzz Aldrin, and Michael Collins, successfully landed the Apollo 11 spacecraft on the moon, marking the first time humans had ever set foot on another"

-    batch = prepare_singlesample(prompt,
-                                 my_ans,
-                                 tokenizer,
-                                 max_seq_len=512,
-                                 end_of_conversation_token=args.end_of_conversation_token)
+    batch = prepare_singlesample(
+        prompt,
+        my_ans,
+        tokenizer,
+        max_seq_len=512,
+        end_of_conversation_token=args.end_of_conversation_token)
     batch = to_device(batch, device)

     rm_model.eval()
diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
index 966a7927e..2e9b7ee35 100644
--- a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
+++ b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py
@@ -267,16 +267,20 @@ def parse_args():
         '--critic_gradient_checkpointing',
         action='store_true',
         help='Enable HF gradient checkpointing for Critic model.')
-    parser.add_argument("--actor_dropout",
-                        type=float,
-                        default=None,
-                        help="If actor dropout configured, use it. "
-                        "Otherwise, keep the default dropout configuration of the actor model.")
-    parser.add_argument("--critic_dropout",
-                        type=float,
-                        default=None,
-                        help="If critic dropout configured, use it. "
-                        "Otherwise, keep the default dropout configuration of the critic model.")
+    parser.add_argument(
+        "--actor_dropout",
+        type=float,
+        default=None,
+        help="If actor dropout configured, use it. "
+        "Otherwise, keep the default dropout configuration of the actor model."
+    )
+    parser.add_argument(
+        "--critic_dropout",
+        type=float,
+        default=None,
+        help="If critic dropout configured, use it. "
+        "Otherwise, keep the default dropout configuration of the critic model."
+    )
     ## LoRA for efficient training setting
     parser.add_argument("--actor_lora_dim",
                         type=int,
@@ -321,12 +325,13 @@ def parse_args():
         action='store_true',
         help='Enable Mixed Precision ZeRO++ for training and generation.')
     ## bf16
-    parser.add_argument('--no_bf16_to_fp32_loss',
-                        action='store_false',
-                        dest='bf16_to_fp32_loss',
-                        help='Relevant only with bf16 dtype. '
-                        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32. '
-                        'This applies for both actor and critic models.')
+    parser.add_argument(
+        '--no_bf16_to_fp32_loss',
+        action='store_false',
+        dest='bf16_to_fp32_loss',
+        help='Relevant only with bf16 dtype. '
+        'If specified, loss is calculated in bf16. Otherwise, calculated in fp32. '
+        'This applies for both actor and critic models.')
     ## Tensorboard logging
     parser.add_argument('--enable_tensorboard',
                         action='store_true',
@@ -335,9 +340,10 @@ def parse_args():
                         type=str,
                         default="step3_tensorboard")
     ## Tokenizer
-    parser.add_argument("--add_eot_token",
-                        action='store_true',
-                        help="Add <|endoftext|> as additional special token to tokenizer")
+    parser.add_argument(
+        "--add_eot_token",
+        action='store_true',
+        help="Add <|endoftext|> as additional special token to tokenizer")
     ## Actor/critic model overflow alignment
     parser.add_argument(
         '--align_overflow',
@@ -347,10 +353,11 @@ def parse_args():
     parser.add_argument('--print_answers',
                         action='store_true',
                         help='Print prompt and answers during training')
-    parser.add_argument("--print_answers_interval",
-                        type=int,
-                        default=1,
-                        help="If --print_answers enabled, controls the printing interval.")
+    parser.add_argument(
+        "--print_answers_interval",
+        type=int,
+        default=1,
+        help="If --print_answers enabled, controls the printing interval.")
     ## Testing
     parser.add_argument(
         '--enable_test_mode',
@@ -505,7 +512,6 @@ def main():
         rlhf_engine.actor.optimizer.quantize_nontrainable_params()
         print_rank_0("Mixed Precision ZeRO++ enabled")

-
     ppo_trainer = DeepSpeedPPOTrainerUnsupervised if unsupervised_training_enabled else DeepSpeedPPOTrainer
     trainer = ppo_trainer(rlhf_engine, args)

@@ -516,7 +522,9 @@ def main():
         args.per_device_training_batch_size)

     # Train!
-    print_rank_0(f"***** Running training (total_iters={num_total_iters}) *****", args.global_rank)
+    print_rank_0(
+        f"***** Running training (total_iters={num_total_iters}) *****",
+        args.global_rank)

     non_overflow_step_count = 0
     step_average_reward = 0.