
Commit

Merge pull request #39 from pjlab-sys4nlp/data_mix
CPT: add more args and exec scripts
DaizeDong authored Nov 22, 2023
2 parents d6a3780 + 4aa350c commit f569ea8
Showing 12 changed files with 870 additions and 125 deletions.
8 changes: 5 additions & 3 deletions scripts/analysis/get_layer_wise_score_scale_factor.sh
@@ -2,13 +2,15 @@

# llama_7B llama_13B llama_30B llama_base llama_3B
# llama2_7B llama2_13B llama2_30B llama2_base
-llama_size="llama_13B"
-model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons
+llama_size="llama2_7B"
+# model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons
+model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama2_7B

data_begin_index=0
data_end_index=500
batch_size=8
-block_size=2048
+# block_size=2048
+block_size=4096

#save_folder=${llama_size}_dense
save_folder=${llama_size}_moe_trained
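A rough size check, assuming each data index selects one block_size-token block (an assumption, not stated in the script): indices 0–500 at block_size 4096 cover roughly 500 × 4096 ≈ 2.05M tokens per scoring run.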
164 changes: 164 additions & 0 deletions scripts/cpt/dynamic_data_selection/baseline_112gpus.sh
@@ -0,0 +1,164 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-llama2_random_scale4_112gpus_dynamic_data
#SBATCH --output=/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data/%x-%j.log
#SBATCH --error=/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data/%x-%j.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=64
#SBATCH --mem=0

#SBATCH --nodes=14
#SBATCH --gres=gpu:8
#SBATCH --quotatype=reserved
#SBATCH -x SH-IDCA1404-10-140-54-36

# --quotatype choices: reserved, spot

source ~/anaconda3/bin/activate smoe

{
num_nodes=14        # must match #SBATCH --nodes above
num_gpu_per_node=8  # must match #SBATCH --gres above

# OMP_NUM_THREADS: #CPUs per task / #GPUs per node
export OMP_NUM_THREADS=32
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

##############################################################
############### LLAMA 7B Moefication 16Experts ###############
# comment="llama 7B residual, gradient, 2 + 2/14 | soft residual 2.0 | soft moe 2.0 | GPU num 1, per-device bs 64, lr 1e-4"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEResidualForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-14Select2-2Residuals-688Neurons-Share

##############################################################
######## LLAMA 2 7B 16 Experts all kinds of ablations ########
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual hard, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual plain soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual learn soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
model_type="llama_moe"
comment="llama 2 7B, random 4/16, one linear layer gate"
pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama2_7B-16Select4-up_proj-Scale4.0

# comment="llama 2 7B, residual 2, share gradient 2/14 | residual hard, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual plain soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual learn soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual learn soft 2.0, moe soft 2.0 | GPU num 16, per-device bs 32, lr 3e-4"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEResidualForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama2_7B-14Select2-2Residuals-688Neurons-Share

##############################################################

tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama2_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized

lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=8
gradient_accumulation_steps=4
block_size=4096
num_tokens="200*10^9"
warmup_tokens="15*10^8"
# warmup_tokens="0"
eval_tokens="2.5*10^9"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json
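# DeepSpeed config: bf16 with ZeRO stage 1, as the file name suggests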

num_selects=4
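# experts activated per token: top-4 of the 16 experts in the "16Select4" checkpoint above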

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "${num_tokens} / ($block_size)" | bc)
echo "max_steps: $max_steps"
echo "max_train_samples: $max_train_samples"
global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "global batch size: $global_bs"
tokens_per_batch=$(echo "$global_bs * $block_size" | bc)
echo "#tokens/batch: $tokens_per_batch"
warmup_steps=$(echo "$warmup_tokens / ($tokens_per_batch)" | bc)
echo "warmup tokens: $warmup_tokens, warmup steps: $warmup_steps"
eval_steps=$(echo "$eval_tokens / ($tokens_per_batch)" | bc)
echo "eval interval (tokens): $eval_tokens, steps: $eval_steps"

data_cache=resources/cache
base_dir="/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data"
output_dir=$base_dir/outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/env
echo $comment > $output_dir/comment.txt
echo "$SLURM_JOB_ID" > $base_dir/latest.jobid
ln -snf $output_dir $base_dir/latest.dir
ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log
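# The block above snapshots the sbatch script, git diff, and environment into
# output_dir, and points latest.{jobid,dir,log} at the current run for easy access.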

nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST))
nodes_array=("${nodes[@]}")
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
echo "Node list: $SLURM_JOB_NODELIST"

srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt/cpt_fpt.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
--tokenizer_name_or_path ${tokenizer_path} \
--dataset_dir ${dataset_dir} \
--data_cache_dir ${data_cache} \
--validation_dir ${validation_dir} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--evaluation_strategy steps \
--eval_steps ${eval_steps} \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion ${final_lr_portion} \
--optim adamw_torch \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--learning_rate ${lr} \
--weight_decay 0.1 \
--max_grad_norm 1.0 \
--warmup_steps 100 \
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_steps ${eval_steps} \
--dataloader_num_workers 0 \
--dataloader_pin_memory True \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--block_size ${block_size} \
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 3600 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--logging_first_step True \
--logging_strategy steps \
--logging_steps 5 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--report_to none \
--gate_type "TopKBalancedNoisyGate" \
--calculator_type "UniversalCalculator" \
--num_selects ${num_selects}
}
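A minimal usage sketch for this script (assuming the shared /mnt/petrelfs paths and the smoe conda environment are available):

    sbatch scripts/cpt/dynamic_data_selection/baseline_112gpus.sh
    tail -f /mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data/latest.log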
165 changes: 165 additions & 0 deletions scripts/cpt/dynamic_data_selection/baseline_112gpus_scale2.0.sh
@@ -0,0 +1,165 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-llama2_random_scale4_112gpus_dynamic_data
#SBATCH --output=/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data/%x-%j.log
#SBATCH --error=/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data/%x-%j.log

#SBATCH --partition=MoE
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=64
#SBATCH --mem=0

#SBATCH --nodes=14
#SBATCH --gres=gpu:8
#SBATCH --quotatype=reserved
#SBATCH -x SH-IDCA1404-10-140-54-36

# --quotatype choices: reserved, spot

source ~/anaconda3/bin/activate smoe

{
num_nodes=14        # must match #SBATCH --nodes above
num_gpu_per_node=8  # must match #SBATCH --gres above

# OMP_NUM_THREADS: #CPUs per task / #GPUs per node
export OMP_NUM_THREADS=32
export LOGLEVEL=INFO
# export NCCL_DEBUG=INFO
# export TORCH_DISTRIBUTED_DEBUG=DETAIL
# export TORCH_SHOW_CPP_STACKTRACES=1
# export CUDA_LAUNCH_BLOCKING=1

##############################################################
############### LLAMA 7B Moefication 16Experts ###############
# comment="llama 7B residual, gradient, 2 + 2/14 | soft residual 2.0 | soft moe 2.0 | GPU num 1, per-device bs 64, lr 1e-4"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEResidualForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama_7B-14Select2-2Residuals-688Neurons-Share

##############################################################
######## LLAMA 2 7B 16 Experts all kinds of ablations ########
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual hard, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual plain soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, moefication gradient 2/14 | residual learn soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
model_type="llama_moe"
comment="llama 2 7B, random 4/16"
pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama2_7B-16Select4-up_proj-Scale4.0

# comment="llama 2 7B, residual 2, share gradient 2/14 | residual hard, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual plain soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual learn soft 8.0, moe soft 8.0 | GPU num 16, per-device bs 32, lr 3e-4"
# comment="llama 2 7B, residual 2, share gradient 2/14 | residual learn soft 2.0, moe soft 2.0 | GPU num 16, per-device bs 32, lr 3e-4"
# pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEResidualForCausalLM/Gradient-max-l1_norm-sample-feature_change/llama2_7B-14Select2-2Residuals-688Neurons-Share

##############################################################

tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama2_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/SlimPajama_processed
validation_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized

lr=2e-4
final_lr_portion=0.1
per_device_train_batch_size=8
per_device_eval_batch_size=8
gradient_accumulation_steps=4
block_size=4096
num_tokens="200*10^9"
warmup_tokens="15*10^8"
# warmup_tokens="0"
eval_tokens="2.5*10^9"
seed=1227
deepspeed_config_file=conf/deepspeed/bf16_zero1_default.json

num_selects=4

max_steps=$(echo "${num_tokens} / ($block_size * $per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node)" | bc)
max_train_samples=$(echo "${num_tokens} / ($block_size)" | bc)
echo "max_steps: $max_steps"
echo "max_train_samples: $max_train_samples"
global_bs=$(echo "$per_device_train_batch_size * $gradient_accumulation_steps * $num_nodes * $num_gpu_per_node" | bc)
echo "global batch size: $global_bs"
tokens_per_batch=$(echo "$global_bs * $block_size" | bc)
echo "#tokens/batch: $tokens_per_batch"
warmup_steps=$(echo "$warmup_tokens / ($tokens_per_batch)" | bc)
echo "warmup tokens: $warmup_tokens, warmup steps: $warmup_steps"
eval_steps=$(echo "$eval_tokens / ($tokens_per_batch)" | bc)
echo "eval interval (tokens): $eval_tokens, steps: $eval_steps"

data_cache=resources/cache
base_dir="/mnt/petrelfs/share_data/quxiaoye/runs/llama2_random_scale4_112gpus_dynamic_data"
output_dir=$base_dir/outputs/$SLURM_JOB_NAME-$SLURM_JOB_ID
mkdir -p $output_dir
echo "output_dir: $output_dir"
scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh
git diff > $output_dir/diff.patch
env > $output_dir/env
echo $comment > $output_dir/comment.txt
echo "$SLURM_JOB_ID" > $base_dir/latest.jobid
ln -snf $output_dir $base_dir/latest.dir
ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log

nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST))
nodes_array=("${nodes[@]}")
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
echo "Node list: $SLURM_JOB_NODELIST"

srun torchrun \
--nnodes ${num_nodes} \
--nproc_per_node ${num_gpu_per_node} \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
smoe/entrypoint/cpt/cpt_fpt.py \
--moe_calculator_score_scale_factor 2.0 \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--model_type ${model_type} \
--tokenizer_name_or_path ${tokenizer_path} \
--dataset_dir ${dataset_dir} \
--data_cache_dir ${data_cache} \
--validation_dir ${validation_dir} \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--evaluation_strategy steps \
--eval_steps ${eval_steps} \
--seed ${seed} \
--bf16 \
--num_train_epochs 1 \
--final_lr_portion ${final_lr_portion} \
--optim adamw_torch \
--adam_beta1 0.9 \
--adam_beta2 0.95 \
--learning_rate ${lr} \
--weight_decay 0.1 \
--max_grad_norm 1.0 \
--warmup_steps 100 \
--max_steps ${max_steps} \
--max_train_samples ${max_train_samples} \
--save_strategy steps \
--save_total_limit 1 \
--save_steps ${eval_steps} \
--dataloader_num_workers 0 \
--dataloader_pin_memory True \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--block_size ${block_size} \
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 3600 \
--ddp_find_unused_parameters False \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--logging_first_step True \
--logging_strategy steps \
--logging_steps 5 \
--log_level info \
--log_level_replica warning \
--log_on_each_node False \
--report_to none \
--gate_type "TopKBalancedNoisyGate" \
--calculator_type "UniversalCalculator" \
--num_selects ${num_selects}
}