add more on benchmark and fix prompt model loading (#296)

* add more on benchmark and fix prompt model loading
* fix format

---------

Co-authored-by: Michael Wyatt <[email protected]>
microsoft · Apr 17, 2023 · 2aa7a31 · 2aa7a31
1 parent 283e8b2
commit 2aa7a31
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 22 deletions.
diff --git a/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py b/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/prompt_eval.py
@@ -5,13 +5,19 @@
 import argparse
 import logging
 import torch
+import sys
+import os
 
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
 )
 
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
+from utils.model.model_utils import create_hf_model
+
 logger = logging.getLogger(__name__)
 
 
@@ -75,23 +81,6 @@ def parse_args():
     return args
 
 
-def get_model(config, model_path, tokenizer):
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        from_tf=bool(".ckpt" in model_path),
-        config=config,
-    )
-    model.resize_token_embeddings(len(tokenizer))
-
-    # prepare the tokenizer and model config
-    tokenizer.pad_token = tokenizer.eos_token
-    model.config.end_token_id = tokenizer.eos_token_id
-    model.config.pad_token_id = model.config.eos_token_id
-
-    return model
-
-
 def generate(model,
              tokenizer,
              inputs,
@@ -212,10 +201,12 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path_baseline,
                                               fast_tokenizer=True)
 
-    model_baseline = get_model(config, args.model_name_or_path_baseline,
-                               tokenizer)
-    model_fintuned = get_model(config, args.model_name_or_path_finetune,
-                               tokenizer)
+    model_baseline = create_hf_model(AutoModelForCausalLM,
+                                     args.model_name_or_path_baseline,
+                                     tokenizer, None)
+    model_fintuned = create_hf_model(AutoModelForCausalLM,
+                                     args.model_name_or_path_finetune,
+                                     tokenizer, None)
 
     model_baseline.to(device)
     model_fintuned.to(device)

diff --git a/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md b/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md
@@ -4,7 +4,7 @@ As stated in [Blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/dee
 
 > ***Very Important Details***: The numbers in both Table 1 and 2 of the blog are for Step 3 of the training and based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs).  We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](./BenckmarkSetting.md) page for more details.
 
-an apple-to-apple comparison is critical for the machine learning community, particularly for benchmarking. Therefore, we here elaborate more on details.
+An apples-to-apples comparison is critical for the machine learning community, particularly for benchmarking. For example, it is not fair to compare DeepSpeed-Chat's end-to-end training time with that of Alpaca or Vicuna (both of which focus on instruction finetuning), since they do not include the full RLHF training pipeline. Therefore, we elaborate on the details here.
 
 We randomly select 40% of the training data from the six open-source training datasets, i.e., ``"Dahoas/rm-static", "Dahoas/full-hh-rlhf", "Dahoas/synthetic-instruct-gptj-pairwise", "yitingxie/rlhf-reward-datasets", "openai/webgpt_comparisons"``, and ``"stanfordnlp/SHP"``. The total number of training samples is 264,292. We fix the query (prompt) sequence length at **256** and generate fixed-length answers of **256** tokens. As such, the total number of training tokens per epoch is 135,317,504. During benchmark testing, we set the number of training epochs to 1.