
Commit

Modify code so that different accelerators can be called according to specific device conditions (microsoft#844)

* modify inference-test.py to support running on Intel devices

* modify ds-hf-compare.py to support running on Intel devices

* use deepspeed.accelerator.get_accelerator() to replace the hard-coded CUDA calls, so that whichever accelerators are available on the current device (not just NVIDIA GPUs) can be accessed and enabled (a sketch of this pattern follows the changed-files summary below)

* line 117: self.model.xpu().to(self.device) ---> self.model.to(self.device), for generalization.

* For upstream, use get_accelerator() to hide the backend. Add the bf16 dtype for CPU.

* Update README.md

* Delete redundant commented-out code

* Delete +123 in README title

* delete checkpoints.json

* modify inference-test.py

* modify inference-test.py v2

* modify inference.py v3

* add bfloat16 for cpu

* fix an error in setup commands with conda

---------

Co-authored-by: Olatunji Ruwase <[email protected]>
2 people authored and stceum committed Jan 27, 2024
1 parent fae277a commit 94677c7
Showing 5 changed files with 11 additions and 8 deletions.
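
The common pattern across these files is to route device access through DeepSpeed's accelerator abstraction instead of hard-coding CUDA. A minimal sketch of the substitution (the device index 0 is a placeholder; only calls that appear in the diffs below are used):

import torch
from deepspeed.accelerator import get_accelerator

# Portable replacement for torch.device("cuda:0") / torch.device(f"cuda:{i}")
device = torch.device(get_accelerator().device_name(0))

# Portable replacement for torch.cuda.synchronize()
get_accelerator().synchronize()
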
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/README.md
@@ -20,7 +20,7 @@ If you are using conda, the following works:
conda create -c conda-forge -n deepspeed python=3.10
conda activate deepspeed
pip install -r requirements.txt
-deepspeed --num_gpus 1 inference-test.py --name bigscience/bloom-3b --batch_size 2
+deepspeed --num_gpus 1 inference-test.py --model bigscience/bloom-3b --batch_size 2
</pre>

# Inference Test
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/arguments.py
@@ -7,7 +7,7 @@
parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path")
parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save-path to store the new model checkpoint")
parser.add_argument("--batch_size", default=1, type=int, help="batch size")
-parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type")
+parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type")
parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference")
parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection")
parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache")
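
Because the scripts resolve the dtype with getattr(torch, args.dtype), adding "bfloat16" to the argparse choices is all that is needed for it to map to torch.bfloat16. A minimal sketch of that resolution (the parsed value is supplied inline for illustration):

import torch
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--dtype", default="float16", type=str,
                    choices=["float32", "float16", "int8", "bfloat16"], help="data-type")
args = parser.parse_args(["--dtype", "bfloat16"])
data_type = getattr(torch, args.dtype)  # resolves to torch.bfloat16, usable for CPU inference
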
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/ds-hf-compare.py
@@ -3,11 +3,12 @@
from transformers import pipeline
from difflib import SequenceMatcher
from argparse import ArgumentParser
+from deepspeed.accelerator import get_accelerator

parser = ArgumentParser()

parser.add_argument("--model", required=True, type=str, help="model_name")
-parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type")
+parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type")
parser.add_argument("--num_inputs", default=1, type=int, help="number of test inputs")
parser.add_argument("--min_length", default=200, type=int, help="minimum tokens generated")
parser.add_argument("--max_length", default=300, type=int, help="maximum tokens generated")
@@ -73,7 +74,7 @@ def string_similarity(str1, str2):
inputs = test_inputs

data_type = getattr(torch, args.dtype)
-pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=0)
+pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=torch.device(get_accelerator().device_name(0)))

base_out_list = []
match_count=0
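
For reference, a minimal sketch of how the comparison script now builds its Hugging Face pipeline on whatever accelerator DeepSpeed detects (the model name is taken from the README example; the prompt and generation lengths are placeholders):

import torch
from transformers import pipeline
from deepspeed.accelerator import get_accelerator

data_type = torch.bfloat16  # or torch.float16 / torch.float32, per --dtype
device = torch.device(get_accelerator().device_name(0))
pipe = pipeline('text-generation', 'bigscience/bloom-3b', torch_dtype=data_type, device=device)
print(pipe('DeepSpeed is', min_length=20, max_length=50)[0]['generated_text'])
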
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/inference-test.py
@@ -6,6 +6,7 @@
import time
from utils import DSPipeline, Performance
from deepspeed.runtime.utils import see_memory_usage
+from deepspeed.accelerator import get_accelerator
from arguments import parser

args = parser.parse_args()
@@ -76,12 +77,12 @@
iters = 30 if args.test_performance else 2 #warmup
times = []
for i in range(iters):
-torch.cuda.synchronize()
+get_accelerator().synchronize()
start = time.time()
outputs = pipe(inputs,
num_tokens=args.max_new_tokens,
do_sample=(not args.greedy))
-torch.cuda.synchronize()
+get_accelerator().synchronize()
end = time.time()
times.append(end - start)
print(f"generation time is {times[1]} sec")
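
The benchmark loop now synchronizes through the same abstraction, so the measured times are valid on CUDA, XPU, or CPU backends alike. A minimal sketch of the pattern (the commented-out generation call is a stand-in for the actual pipe invocation):

import time
from deepspeed.accelerator import get_accelerator

times = []
for i in range(2):
    get_accelerator().synchronize()  # wait for any queued work on the active accelerator
    start = time.time()
    # outputs = pipe(inputs, num_tokens=..., do_sample=...)  # accelerator-bound work goes here
    get_accelerator().synchronize()  # ensure generation has finished before stopping the clock
    times.append(time.time() - start)
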
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/utils.py
@@ -10,6 +10,7 @@
import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast
+from deepspeed.accelerator import get_accelerator

class DSPipeline():
'''
@@ -34,7 +35,7 @@ def __init__(self,
elif device < 0:
self.device = torch.device("cpu")
else:
-self.device = torch.device(f"cuda:{device}")
+self.device = torch.device(get_accelerator().device_name(device))

# the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time.
self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"]
@@ -110,7 +111,7 @@ def generate_outputs(self,
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(self.device)

-self.model.cuda().to(self.device)
+self.model.to(self.device)

if isinstance(self.tokenizer, LlamaTokenizerFast):
# NOTE: Check if Llamma can work w/ **input_tokens
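
DSPipeline's device resolution and the final model placement both go through the generic path now. A minimal standalone sketch of the same idea (the handling of None and torch.device inputs here is an assumption for illustration; the hunk above only shows the negative-index and default branches):

import torch
from deepspeed.accelerator import get_accelerator

def resolve_device(device):
    if isinstance(device, torch.device):
        return device
    if device is None or device < 0:
        return torch.device('cpu')
    return torch.device(get_accelerator().device_name(device))

# model.to(resolve_device(0)) works whether the backend is CUDA, XPU, or CPU,
# replacing the earlier model.cuda().to(device) / model.xpu().to(device) calls.
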
