From b563ff971d870f7b839e08faf833b286c4cf0fdb Mon Sep 17 00:00:00 2001
From: stceum <50257864+stceum@users.noreply.github.com>
Date: Fri, 8 Dec 2023 22:15:40 +0800
Subject: [PATCH] Update training scripts of step2 DPO in DeepSpeed-Chat.

---
 .../training_scripts/README.md                |  2 +-
 .../training_scripts/llama2/run_llama2_7b.sh  | 13 +++---
 .../llama2/run_llama2_7b_lora.sh              | 13 +++---
 .../run_6.7b.sh => multi_node/run_350m.sh}    | 16 +++----
 .../single_gpu/{run_1.3b.sh => run_350m.sh}   |  6 +--
 .../opt/single_gpu/run_6.7b_lora.sh           | 31 -------------
 .../opt/single_node/run_1.3b.sh               | 35 ---------------
 .../opt/single_node/run_1.3b_lora.sh          | 31 -------------
 .../opt/single_node/run_13b.sh                | 36 ----------------
 .../opt/single_node/run_30b_lora.sh           | 28 ------------
 .../run_66b.sh => single_node/run_350m.sh}    | 12 +++---
 .../opt/single_node/sweep/README.md           |  7 ++-
 .../opt/single_node/sweep/run_single.sh       | 43 +++++--------------
 .../opt/single_node/sweep/run_step1_sweep.sh  | 25 -----------
 .../opt/single_node/sweep/run_step2_sweep.sh  | 21 +++++++++
 .../other_language/run_chinese.sh             | 36 ----------------
 .../other_language/run_japanese.sh            | 36 ----------------
 17 files changed, 65 insertions(+), 326 deletions(-)
 rename applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/{single_node/run_6.7b.sh => multi_node/run_350m.sh} (72%)
 rename applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/{run_1.3b.sh => run_350m.sh} (75%)
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_13b.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_30b_lora.sh
 rename applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/{multi_node/run_66b.sh => single_node/run_350m.sh} (78%)
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh
 create mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_chinese.sh
 delete mode 100644 applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_japanese.sh

diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md
index f680397af..ca2d5eb70 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/README.md
@@ -1,6 +1,6 @@
 ### 💁For each folder, the bash scripts are examples of "facebook/opt" family.
 If you want to change your model such as EleutherAI/gpt-j-6b, you may simply replace
-`` --model_name_or_path facebook/opt-1.3b`` to ``--model_name_or_path EleutherAI/gpt-j-6b ``.
+`` --model_name_or_path facebook/opt-350m`` to ``--model_name_or_path EleutherAI/gpt-neo-125m ``.
 For the models we support, please see [our landing page](./../../../README.md#-supported-models-)
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh
index 2fe70be13..aaa059bbe 100755
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b.sh
@@ -6,7 +6,7 @@
 OUTPUT=$1
 ZERO_STAGE=$2
 if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output_step1_llama2_7b
+    OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
 fi
 if [ "$ZERO_STAGE" == "" ]; then
     ZERO_STAGE=3
@@ -14,15 +14,15 @@ fi
 mkdir -p $OUTPUT
 
 deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
+   --data_path Dahoas/rm-static \
    --data_split 2,4,4 \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
+   --per_device_train_batch_size 8 \
+   --per_device_eval_batch_size 8 \
    --max_seq_len 512 \
    --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 4 \
+   --weight_decay 0.1 \
+   --num_train_epochs 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 0 \
@@ -30,5 +30,6 @@ deepspeed main.py \
    --gradient_checkpointing \
    --zero_stage $ZERO_STAGE \
    --deepspeed \
+   --offload \
    --output_dir $OUTPUT \
    &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
index 7689266ee..ec48de78a 100755
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/llama2/run_llama2_7b_lora.sh
@@ -6,7 +6,7 @@
 OUTPUT=$1
 ZERO_STAGE=$2
 if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output_step1_llama2_7b_lora
+    OUTPUT=./output_step2_llama_7b_epoch1_lr9.65e-6
 fi
 if [ "$ZERO_STAGE" == "" ]; then
     ZERO_STAGE=3
@@ -14,15 +14,15 @@ fi
 mkdir -p $OUTPUT
 
 deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
+   --data_path Dahoas/rm-static \
    --data_split 2,4,4 \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
+   --per_device_train_batch_size 8 \
+   --per_device_eval_batch_size 8 \
    --max_seq_len 512 \
    --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 4 \
+   --weight_decay 0.1 \
+   --num_train_epochs 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 0 \
@@ -30,6 +30,7 @@ deepspeed main.py \
    --gradient_checkpointing \
    --zero_stage $ZERO_STAGE \
    --deepspeed \
+   --offload \
    --lora_dim 128 \
    --lora_module_name "layers." \
    --output_dir $OUTPUT \
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_6.7b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh
similarity index 72%
rename from applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_6.7b.sh
rename to applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh
index 126f8892f..b55beef98 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_6.7b.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_350m.sh
@@ -9,25 +9,25 @@ if [ "$OUTPUT" == "" ]; then
     OUTPUT=./output
 fi
 if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=3
+    ZERO_STAGE=0
 fi
 mkdir -p $OUTPUT
 
 deepspeed main.py \
    --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
    --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-6.7b \
-   --per_device_train_batch_size 6 \
-   --per_device_eval_batch_size 6 \
+   --model_name_or_path facebook/opt-350m \
+   --per_device_train_batch_size 2 \
+   --per_device_eval_batch_size 2 \
    --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
+   --learning_rate 5e-5 \
+   --weight_decay 0.1 \
+   --dropout 0.0 \
+   --num_train_epochs 1 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 0 \
    --seed 1234 \
-   --gradient_checkpointing \
    --zero_stage $ZERO_STAGE \
    --deepspeed \
    --output_dir $OUTPUT \
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh
similarity index 75%
rename from applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh
rename to applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh
index a0a2fddc9..8157865a5 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_1.3b.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_350m.sh
@@ -3,8 +3,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
-
-# Note that usually LoRA needs to use larger learning rate
 OUTPUT=$1
 ZERO_STAGE=$2
 if [ "$OUTPUT" == "" ]; then
@@ -15,8 +13,8 @@ if [ "$ZERO_STAGE" == "" ]; then
 fi
 mkdir -p $OUTPUT
 
-deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-1.3b \
-   --gradient_accumulation_steps 8 --lora_dim 128 --zero_stage $ZERO_STAGE \
+deepspeed --num_gpus 1 main.py --model_name_or_path facebook/opt-350m \
+   --weight_decay 0.1 --dropout 0.0 --gradient_accumulation_steps 4 --zero_stage $ZERO_STAGE \
    --enable_tensorboard \
    --tensorboard_path $OUTPUT \
    --deepspeed --output_dir $OUTPUT &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh
deleted file mode 100644
index d4189bb1e..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_gpu/run_6.7b_lora.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-# Note that usually LoRA needs to use larger learning rate
-OUTPUT_PATH=./output
-mkdir -p $OUTPUT_PATH
-
-deepspeed --num_gpus 1 main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-6.7b \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
-   --max_seq_len 512 \
-   --learning_rate 1e-3 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 16 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --gradient_checkpointing \
-   --zero_stage 0 \
-   --lora_dim 128 \
-   --lora_module_name decoder.layers. \
-   --deepspeed \
-   --output_dir $OUTPUT_PATH \
-   &> $OUTPUT_PATH/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b.sh
deleted file mode 100644
index 3eeeefe02..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-OUTPUT=$1
-ZERO_STAGE=$2
-if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output
-fi
-if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=2
-fi
-mkdir -p $OUTPUT
-
-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-1.3b \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --zero_stage $ZERO_STAGE \
-   --deepspeed \
-   --enable_tensorboard \
-   --tensorboard_path $OUTPUT \
-   --output_dir $OUTPUT \
-   &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh
deleted file mode 100644
index c0057812c..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_1.3b_lora.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-
-# Note that usually LoRA needs to use larger learning rate
-OUTPUT_PATH=./output
-mkdir -p $OUTPUT_PATH
-
-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-1.3b \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
-   --max_seq_len 512 \
-   --learning_rate 1e-3 \
-   --weight_decay 0.1 \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --zero_stage 0 \
-   --lora_dim 128 \
-   --lora_module_name decoder.layers. \
-   --only_optimize_lora \
-   --deepspeed \
-   --output_dir $OUTPUT_PATH \
-   &> $OUTPUT_PATH/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_13b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_13b.sh
deleted file mode 100644
index f93f1f9f7..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_13b.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-OUTPUT=$1
-ZERO_STAGE=$2
-if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output
-fi
-if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=3
-fi
-mkdir -p $OUTPUT
-
-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-13b \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
-   --max_seq_len 512 \
-   --learning_rate 1e-4 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --gradient_checkpointing \
-   --zero_stage $ZERO_STAGE \
-   --lora_dim 128 \
-   --lora_module_name decoder.layers. \
-   --deepspeed \
-   --output_dir $OUTPUT \
-   &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_30b_lora.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_30b_lora.sh
deleted file mode 100644
index 661279c5f..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_30b_lora.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-OUTPUT_PATH=./output
-mkdir -p $OUTPUT_PATH
-
-deepspeed main.py \
-   --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
-   --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-30b \
-   --per_device_train_batch_size 4 \
-   --per_device_eval_batch_size 4 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --lora_dim 128 \
-   --gradient_checkpointing \
-   --zero_stage 3 \
-   --deepspeed \
-   --output_dir $OUTPUT_PATH \
-   &> $OUTPUT_PATH/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_66b.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh
similarity index 78%
rename from applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_66b.sh
rename to applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh
index 4df99382a..16aed6a42 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/multi_node/run_66b.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/run_350m.sh
@@ -9,28 +9,26 @@ if [ "$OUTPUT" == "" ]; then
     OUTPUT=./output
 fi
 if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=3
+    ZERO_STAGE=0
 fi
 mkdir -p $OUTPUT
 
 deepspeed main.py \
    --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
    --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-66b \
+   --model_name_or_path facebook/opt-350m \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --max_seq_len 512 \
-   --learning_rate 1e-4 \
+   --learning_rate 5e-5 \
    --weight_decay 0.1 \
-   --num_train_epochs 2 \
+   --num_train_epochs 1 \
+   --dropout 0.0 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 0 \
    --seed 1234 \
-   --gradient_checkpointing \
    --zero_stage $ZERO_STAGE \
-   --lora_dim 128 \
-   --lora_module_name decoder.layers. \
    --deepspeed \
    --output_dir $OUTPUT \
    &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md
index 254442faf..1f90b9f65 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/README.md
@@ -5,17 +5,16 @@
   * [Usage](#usage)
 
 # Introduction
-The step 1 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
+The step 2 characterization script sweeps across various training parameters. Currently, the following parameters are swept:
 <pre>
 Zero Stage: 2, 3
 Offload: True, False
-Lora: True, False
 </pre>
 
-The `run_step1_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc).
+The `run_step2_sweep.sh` script passes configuration arguments to `run_single.sh`, which can be extended to sweep beyond the parameters listed above (e.g. learning rate, weight decay, etc).
 
 # Usage
 The sweep script can be run as follows:
 <pre>
-DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning$ bash training_scripts/opt/single_node/sweep/run_step1_sweep.sh
+DeepSpeedExamples/applications/DeepSpeed-Chat/training/step2_dpo_finetuning$ bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh
 </pre>
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh
index 1590128bb..6f5453af1 100644
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_single.sh
@@ -2,55 +2,35 @@
 # Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
-# DeepSpeed Team
-
-# Note that usually LoRA needs to use larger learning rate
 # DeepSpeed Team
 ZERO_STAGE=$1
 OFFLOAD=$2
-LORA=$3
-OUTPUT=$4
-
+OUTPUT=$3
 if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=2
+    ZERO_STAGE=0
 fi
-
 if [ "$OFFLOAD" == true ]; then
     OFFLOAD="--offload"
 else
     OFFLOAD=""
 fi
-
-if [ "$LORA" == true ]; then
-    LORA_DIM="--lora_dim 128"
-    LORA_MODULE_NAME="--lora_module_name decoder.layers."
-    ONLY_OPTIMIZE_LORA="--only_optimize_lora"
-    LEARNING_RATE="1e-3"
-    WEIGHT_DECAY="0.1"
-else
-    LORA_DIM="--lora_dim 0"
-    LORA_MODULE_NAME=""
-    ONLY_OPTIMIZE_LORA=""
-    LEARNING_RATE="9.65e-6"
-    WEIGHT_DECAY="0."
-fi
-
 if [ "$OUTPUT" == "" ]; then
     OUTPUT=./output
 fi
-
 mkdir -p $OUTPUT
 
 cmd="deepspeed main.py \
    --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
    --data_split 2,4,4 \
-   --model_name_or_path facebook/opt-1.3b \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
+   --model_name_or_path facebook/opt-350m \
+   --num_padding_at_beginning 1 \
+   --per_device_train_batch_size 4 \
+   --per_device_eval_batch_size 4 \
    --max_seq_len 512 \
-   --learning_rate ${LEARNING_RATE} \
-   --weight_decay ${WEIGHT_DECAY} \
-   --num_train_epochs 16 \
+   --learning_rate 5e-5 \
+   --weight_decay 0.1 \
+   --num_train_epochs 1 \
+   --dropout 0.0 \
    --gradient_accumulation_steps 1 \
    --lr_scheduler_type cosine \
    --num_warmup_steps 0 \
@@ -58,8 +38,7 @@ cmd="deepspeed main.py \
    --zero_stage $ZERO_STAGE \
    --deepspeed \
    --output_dir $OUTPUT \
-   $OFFLOAD $LORA_DIM $LORA_MODULE_NAME \
-   $ONLY_OPTIMIZE_LORA"
+   $OFFLOAD"
 
 echo "----------------------------- DS COMMAND -----------------------------"
 echo $cmd
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh
deleted file mode 100644
index 7b6e57823..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step1_sweep.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-for z in {2..3}
-do
-    for offload in true false
-    do
-        for lora in true false
-        do
-            cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
-                ${z} \
-                ${offload} \
-                ${lora} \
-                z${z}_offload_${offload}_lora_${lora}"
-            echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
-            echo $cmd
-            $cmd
-            pkill -9 python
-            sleep 60
-            echo ""
-        done
-    done
-done
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh
new file mode 100644
index 000000000..ad9849e38
--- /dev/null
+++ b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/opt/single_node/sweep/run_step2_sweep.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+for z in {2..3}
+do
+    for offload in true false
+    do
+        cmd="bash training_scripts/opt/single_node/sweep/run_single.sh \
+            ${z} \
+            ${offload} \
+            z${z}_offload_${offload}"
+        echo "----------------------------- CALLING SHELL SCRIPT -----------------------------"
+        echo $cmd
+        $cmd
+        pkill -9 python
+        sleep 60
+        echo ""
+    done
+done
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_chinese.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_chinese.sh
deleted file mode 100644
index 88d424100..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_chinese.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-OUTPUT=$1
-ZERO_STAGE=$2
-if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output
-fi
-if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=2
-fi
-mkdir -p $OUTPUT
-
-# The Chinese data we found mostly only contain one response without another
-# "rejected" response. Thus we only test the step 1 finetuning and use
-# a data_split of 10,0,0 (keep all data for step 1).
-deepspeed main.py \
-   --data_path wangrui6/Zhihu-KOL Cohere/miracl-zh-queries-22-12 Hello-SimpleAI/HC3-Chinese mkqa-Chinese \
-   --data_split 10,0,0 \
-   --model_name_or_path bigscience/bloom-1b1 \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --zero_stage $ZERO_STAGE \
-   --deepspeed \
-   --output_dir $OUTPUT \
-   &> $OUTPUT/training.log
diff --git a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_japanese.sh b/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_japanese.sh
deleted file mode 100644
index 7b0646244..000000000
--- a/applications/DeepSpeed-Chat/training/step2_dpo_finetuning/training_scripts/other_language/run_japanese.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-OUTPUT=$1
-ZERO_STAGE=$2
-if [ "$OUTPUT" == "" ]; then
-    OUTPUT=./output
-fi
-if [ "$ZERO_STAGE" == "" ]; then
-    ZERO_STAGE=2
-fi
-mkdir -p $OUTPUT
-
-# The Japanese data we found mostly only contain one response without another
-# "rejected" response. Thus we only test the step 1 finetuning and use
-# a data_split of 10,0,0 (keep all data for step 1).
-deepspeed main.py \
-   --data_path mkqa-Japanese Cohere/miracl-ja-queries-22-12 lmqg/qg_jaquad lmqg/qag_jaquad \
-   --data_split 10,0,0 \
-   --model_name_or_path sberbank-ai/mGPT \
-   --per_device_train_batch_size 8 \
-   --per_device_eval_batch_size 8 \
-   --max_seq_len 512 \
-   --learning_rate 9.65e-6 \
-   --weight_decay 0. \
-   --num_train_epochs 16 \
-   --gradient_accumulation_steps 1 \
-   --lr_scheduler_type cosine \
-   --num_warmup_steps 0 \
-   --seed 1234 \
-   --zero_stage $ZERO_STAGE \
-   --deepspeed \
-   --output_dir $OUTPUT \
-   &> $OUTPUT/training.log
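
The scripts kept by this patch all follow the same calling convention shown in the diffs above: an optional output directory as the first argument and an optional ZeRO stage as the second, with each run logging to training.log inside the output directory. A minimal sketch of how the updated step-2 DPO scripts might be invoked (the working directory and output paths below are illustrative assumptions, not part of the patch):

    # run from applications/DeepSpeed-Chat/training/step2_dpo_finetuning
    bash training_scripts/opt/single_gpu/run_350m.sh ./output 0
    bash training_scripts/llama2/run_llama2_7b.sh ./output_step2_llama_7b_epoch1_lr9.65e-6 3
    # sweep ZeRO stage {2,3} x offload {true,false} via the new sweep driver
    bash training_scripts/opt/single_node/sweep/run_step2_sweep.sh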