Xiaoxia/fix dropout #537

Open · wants to merge 4 commits into master
@@ -137,9 +137,10 @@ def parse_args():
parser.add_argument('--gradient_checkpointing',
action='store_true',
help='Enable HF gradient checkpointing for model.')
parser.add_argument('--disable_dropout',
action='store_true',
help='Disable the dropout of the model.')
parser.add_argument('--dropout',
type=float,
default=0.1,
help='Dropout rate of the model (default: 0.1).')
# deepspeed features
parser.add_argument('--offload',
action='store_true',
@@ -209,7 +210,7 @@ def main():
args.model_name_or_path,
tokenizer,
ds_config,
disable_dropout=args.disable_dropout)
dropout=args.dropout)

if args.lora_dim > 0:
model = convert_linear_layer_to_lora(model, args.lora_module_name,
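Note (an illustration, not part of the diff): the boolean --disable_dropout flag is replaced here by a float --dropout argument, so disabling dropout is now spelled as an explicit value. A minimal sketch of the assumed argparse behaviour, using the argument exactly as added above:

import argparse

# Mirror of the argument added in this diff.
parser = argparse.ArgumentParser()
parser.add_argument('--dropout',
                    type=float,
                    default=0.1,
                    help='Dropout rate of the model (default: 0.1).')

# Passing an explicit 0.0 reproduces the removed --disable_dropout behaviour.
args = parser.parse_args(['--dropout', '0.0'])
assert args.dropout == 0.0

# Omitting the argument keeps the previous default of 0.1.
args = parser.parse_args([])
assert args.dropout == 0.1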
@@ -137,9 +137,10 @@ def parse_args():
'--gradient_checkpointing',
action='store_true',
help='Enable HF gradient checkpointing for Actor model.')
parser.add_argument('--disable_dropout',
action='store_true',
help='Disable the dropout of the model.')
parser.add_argument('--dropout',
type=float,
default=0.1,
help='Dropout rate of the model (default: 0.1).')
# deepspeed features
parser.add_argument('--offload',
action='store_true',
@@ -208,7 +209,7 @@ def main():
tokenizer,
ds_config,
args.num_padding_at_beginning,
disable_dropout=args.disable_dropout)
dropout=args.dropout)

if args.lora_dim > 0:
rm_model = convert_linear_layer_to_lora(rm_model,
@@ -23,7 +23,7 @@ deepspeed main.py \
--max_seq_len 512 \
--learning_rate 5e-5 \
--weight_decay 0.1 \
--disable_dropout \
--dropout 0.0 \
--num_train_epochs 1 \
--gradient_accumulation_steps 1 \
--lr_scheduler_type cosine \
@@ -14,5 +14,5 @@ fi
mkdir -p $OUTPUT

deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
--num_padding_at_beginning 1 --weight_decay 0.1 --disable_dropout --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
--num_padding_at_beginning 1 --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
--deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
@@ -24,7 +24,7 @@ deepspeed main.py \
--learning_rate 5e-5 \
--weight_decay 0.1 \
--num_train_epochs 1 \
--disable_dropout \
--dropout 0.0 \
--gradient_accumulation_steps 1 \
--lr_scheduler_type cosine \
--num_warmup_steps 0 \
@@ -255,12 +255,14 @@ def parse_args():
'--critic_gradient_checkpointing',
action='store_true',
help='Enable HF gradient checkpointing for Critic model.')
parser.add_argument('--disable_actor_dropout',
action='store_true',
help='Disable the dropout of the actor model.')
parser.add_argument('--disable_critic_dropout',
action='store_true',
help='Disable the dropout of the critical model.')
parser.add_argument('--actor_dropout',
type=float,
default=0.1,
help='Dropout rate of the actor model (default: 0.1).')
parser.add_argument('--critic_dropout',
type=float,
default=0.1,
help='Dropout rate of the critic model (default: 0.1).')
## LoRA for efficient training setting
parser.add_argument("--actor_lora_dim",
type=int,
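Aside (a hypothetical helper, not code from this PR): launch scripts that still pass the removed boolean flags could be translated mechanically to the new float arguments. A sketch of that mapping, assuming 0.0 is the intended replacement for each "disable" flag:

# Hypothetical mapping from each removed boolean flag to its new float argument.
LEGACY_FLAG_TO_NEW_ARG = {
    '--disable_dropout': ('--dropout', '0.0'),
    '--disable_actor_dropout': ('--actor_dropout', '0.0'),
    '--disable_critic_dropout': ('--critic_dropout', '0.0'),
}

def migrate_cli(argv):
    """Rewrite a legacy argument list, expanding removed flags in place."""
    out = []
    for arg in argv:
        out.extend(LEGACY_FLAG_TO_NEW_ARG.get(arg, (arg,)))
    return out

print(migrate_cli(['--disable_actor_dropout', '--seed', '1234']))
# ['--actor_dropout', '0.0', '--seed', '1234']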
@@ -88,7 +88,7 @@ def _init_actor(self, actor_model_name_or_path):
model_name_or_path=actor_model_name_or_path,
tokenizer=self.tokenizer,
ds_config=ds_config,
disable_dropout=self.args.disable_actor_dropout)
dropout=self.args.actor_dropout)

# LoRA
if self.args.actor_lora_dim > 0:
@@ -203,7 +203,7 @@ def _init_critic(self, critic_model_name_or_path):
ds_config=ds_eval_config,
num_padding_at_beginning=self.args.num_padding_at_beginning,
rlhf_training=True,
disable_dropout=self.args.disable_critic_dropout)
dropout=self.args.critic_dropout)

# LoRA
if self.args.critic_lora_dim > 0:
@@ -51,7 +51,7 @@ deepspeed --master_port 12346 main.py \
--actor_zero_stage $ACTOR_ZERO_STAGE \
--critic_zero_stage $CRITIC_ZERO_STAGE \
--actor_gradient_checkpointing \
--disable_actor_dropout \
--actor_dropout 0.0 \
--actor_lora_dim 128 \
--actor_lora_module_name decoder.layers. \
--output_dir $OUTPUT \
@@ -23,5 +23,5 @@ deepspeed --num_gpus 1 main.py \
--actor_model_name_or_path $ACTOR_MODEL_PATH --critic_model_name_or_path $CRITIC_MODEL_PATH \
--actor_zero_stage $ACTOR_ZERO_STAGE --critic_zero_stage $CRITIC_ZERO_STAGE \
--num_padding_at_beginning 1 --gradient_accumulation_steps 2 \
--deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --disable_actor_dropout \
--deepspeed --actor_lora_dim 128 --enable_hybrid_engine --actor_gradient_checkpointing --actor_dropout 0.0 \
--output_dir $OUTPUT &> $OUTPUT/training.log
@@ -44,7 +44,7 @@ deepspeed --num_gpus 1 main.py \
--actor_lora_dim 128 \
--actor_gradient_checkpointing \
--critic_gradient_checkpointing \
--disable_actor_dropout \
--actor_dropout 0.0 \
--enable_hybrid_engine \
--output_dir $OUTPUT \
&> $OUTPUT/training.log
@@ -41,7 +41,7 @@ deepspeed --master_port 12346 main.py \
--num_train_epochs 1 \
--lr_scheduler_type cosine \
--gradient_accumulation_steps 1 \
--disable_actor_dropout \
--actor_dropout 0.0 \
--num_warmup_steps 100 \
--deepspeed --seed 1234 \
--enable_hybrid_engine \
@@ -38,7 +38,7 @@ deepspeed --master_port 12346 main.py \
--gradient_accumulation_steps 1 \
--num_warmup_steps 100 \
--deepspeed --seed 1234 \
--disable_actor_dropout \
--actor_dropout 0.0 \
${ACTOR_ZERO_STAGE} \
${CRITIC_ZERO_STAGE} \
--actor_lora_dim 128 \
@@ -48,7 +48,7 @@ deepspeed --master_port 12346 main.py \
--actor_zero_stage $ACTOR_ZERO_STAGE \
--critic_zero_stage $CRITIC_ZERO_STAGE \
--actor_gradient_checkpointing \
--disable_actor_dropout \
--actor_dropout 0.0 \
--actor_lora_dim 128 \
--actor_lora_module_name decoder.layers. \
--output_dir $OUTPUT \
@@ -38,7 +38,7 @@ deepspeed --master_port 12346 main.py \
--lr_scheduler_type cosine \
--gradient_accumulation_steps 1 \
--actor_gradient_checkpointing \
--disable_actor_dropout \
--actor_dropout 0.0 \
--num_warmup_steps 100 \
--deepspeed --seed 1234 \
${ACTOR_ZERO_STAGE} \
@@ -44,7 +44,7 @@ deepspeed --master_port 12346 main.py \
--lr_scheduler_type cosine \
--gradient_accumulation_steps 1 \
--actor_gradient_checkpointing \
--disable_actor_dropout \
--actor_dropout 0.0 \
--num_warmup_steps 100 \
--deepspeed --seed 1234 \
--enable_hybrid_engine \
@@ -20,10 +20,10 @@ def create_hf_model(model_class,
tokenizer,
ds_config=None,
rlhf_training=False,
disable_dropout=False):
dropout=0.1):
model_config = AutoConfig.from_pretrained(model_name_or_path)
if disable_dropout:
model_config.dropout = 0.0
if dropout != model_config.dropout:
model_config.dropout = dropout
# Note: dschf is defined in function scope to avoid global effects
# https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration
if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3:
@@ -53,11 +53,11 @@ def create_critic_model(model_name_or_path,
ds_config,
num_padding_at_beginning=0,
rlhf_training=False,
disable_dropout=False):
dropout=0.1):
# The OPT model family always puts a padding token at the beginning of the sequence;
# we did not see this in other models, but we are not sure whether it is a general rule
critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer,
ds_config, rlhf_training, disable_dropout)
ds_config, rlhf_training, dropout)
critic_model = RewardModel(
critic_model,
tokenizer,
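One caveat (an assumption about other model families, not something this diff changes): model_config.dropout exists for OPT-style configs, but other Hugging Face configs name their dropout fields differently (for example attn_pdrop/resid_pdrop/embd_pdrop in GPT-2-style configs), so the direct attribute access above could raise an AttributeError for those models. A defensive sketch of how the override might be generalized, with configure_dropout as a hypothetical helper:

from transformers import AutoConfig

def configure_dropout(model_config, dropout):
    """Override dropout-like attributes only where the config defines them."""
    if dropout is None:
        return model_config  # keep the checkpoint's own defaults
    for attr in ('dropout', 'hidden_dropout_prob',
                 'attn_pdrop', 'resid_pdrop', 'embd_pdrop'):
        if hasattr(model_config, attr):
            setattr(model_config, attr, dropout)
    return model_config

# Example with an OPT config, which does define a `dropout` attribute.
config = configure_dropout(AutoConfig.from_pretrained('facebook/opt-350m'), 0.0)
assert config.dropout == 0.0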