From 8325fd692adc892ff9febaad75e4ca8d37fc0f27 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Wed, 3 Nov 2021 23:28:31 +0500
Subject: [PATCH 1/5] add T5 example using tensor-parallelism

---
 inference/huggingface/test-t5.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 inference/huggingface/test-t5.py

diff --git a/inference/huggingface/test-t5.py b/inference/huggingface/test-t5.py
new file mode 100644
index 000000000..2890d9b94
--- /dev/null
+++ b/inference/huggingface/test-t5.py
@@ -0,0 +1,28 @@
+from transformers import pipeline
+import transformers
+import deepspeed
+import torch
+import os
+from transformers.models.t5.modeling_t5 import T5Block
+
+local_rank = int(os.getenv('LOCAL_RANK', '0'))
+world_size = int(os.getenv('WORLD_SIZE', '4'))
+
+pipe = pipeline("text2text-generation", model="bigscience/T0_3B")
+
+# The injection_policy shows two things:
+# 1. which layer module we need to add Tensor-Parallelism to
+# 2. the name of two linear layers: a) attention_output, and b) transformer output
+
+pipe.model = deepspeed.init_inference(
+    pipe.model,
+    mp_size=world_size,
+    dtype=torch.float,
+    injection_policy={T5Block: ('o', 'wo')}
+)
+
+pipe.device = torch.device(f'cuda:{local_rank}')
+output = pipe("Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy")
+
+if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    print(output)

From c97af8be09d6114f2f4ed574067aee2dd4fc31a9 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Thu, 4 Nov 2021 00:39:09 +0500
Subject: [PATCH 2/5] refine t5 test

---
 inference/huggingface/test-t5.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/inference/huggingface/test-t5.py b/inference/huggingface/test-t5.py
index 2890d9b94..32e62ab14 100644
--- a/inference/huggingface/test-t5.py
+++ b/inference/huggingface/test-t5.py
@@ -8,17 +8,18 @@
 local_rank = int(os.getenv('LOCAL_RANK', '0'))
 world_size = int(os.getenv('WORLD_SIZE', '4'))
 
-pipe = pipeline("text2text-generation", model="bigscience/T0_3B")
+pipe = pipeline("text2text-generation", model="google/t5-v1_1-small", device=local_rank)
 
 # The injection_policy shows two things:
 # 1. which layer module we need to add Tensor-Parallelism to
-# 2. the name of two linear layers: a) attention_output, and b) transformer output
+# 2. the name of several linear layers: a) attention_output (both encoder and decoder),
+#    and b) transformer output
 
 pipe.model = deepspeed.init_inference(
     pipe.model,
     mp_size=world_size,
     dtype=torch.float,
-    injection_policy={T5Block: ('o', 'wo')}
+    injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')}
 )
 
 pipe.device = torch.device(f'cuda:{local_rank}')
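Both versions of test-t5.py read LOCAL_RANK and WORLD_SIZE from the environment, so the script is meant to be started with the DeepSpeed launcher (for example: deepspeed --num_gpus 2 test-t5.py), which sets those variables on every rank. The layer names passed to injection_policy in PATCH 2 refer to submodules of T5Block; the following is a minimal sketch for double-checking those names against the Hugging Face module tree. It is illustrative only and not part of the patch series; the t5-small checkpoint is assumed here just to keep the download small.

    # Sketch: list the Linear submodules of one decoder T5Block so the names used in the
    # injection policy ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo') can be verified.
    import torch
    from transformers import T5ForConditionalGeneration

    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    block = model.decoder.block[0]        # one T5Block from the decoder stack
    for name, module in block.named_modules():
        if isinstance(module, torch.nn.Linear):
            print(name)                   # e.g. layer.0.SelfAttention.o ... layer.2.DenseReluDense.wo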
From 0591ccc052e543bb1dcd24432ace53444bd28743 Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Thu, 4 Nov 2021 04:53:28 +0500
Subject: [PATCH 3/5] add more tests to try the Tensor-Parallel inference

---
 inference/huggingface/gpt-neo.py        |  7 +++---
 inference/huggingface/run_generation.py |  3 ++-
 inference/huggingface/test-electra.py   | 28 ++++++++++++++++++++++++
 inference/huggingface/test-roberta.py   | 29 +++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 4 deletions(-)
 create mode 100644 inference/huggingface/test-electra.py
 create mode 100644 inference/huggingface/test-roberta.py

diff --git a/inference/huggingface/gpt-neo.py b/inference/huggingface/gpt-neo.py
index 25499c595..614fcc5b2 100644
--- a/inference/huggingface/gpt-neo.py
+++ b/inference/huggingface/gpt-neo.py
@@ -15,13 +15,14 @@
     "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
     .format(local_rank,
             world_size))
-
+import pdb;pdb.set_trace()
 generator = pipeline('text-generation',
                      model='EleutherAI/gpt-neo-2.7B',
                      device=local_rank)
 generator.model = deepspeed.init_inference(generator.model,
                                            mp_size=world_size,
                                            dtype=torch.float,
-                                           replace_method='auto')
+                                           replace_method='auto',
+                                           replace_with_kernel_inject=True)
 string = generator("DeepSpeed is", do_sample=True, min_length=50)
-print(string)
+print(string)
\ No newline at end of file

diff --git a/inference/huggingface/run_generation.py b/inference/huggingface/run_generation.py
index 0bef0f499..a609bd1c5 100644
--- a/inference/huggingface/run_generation.py
+++ b/inference/huggingface/run_generation.py
@@ -261,7 +261,8 @@ def main():
     model = deepspeed.init_inference(model,
                                      mp_size=1,
                                      dtype=(torch.half if args.fp16 else torch.float),
-                                     injection_policy=injection_policy)
+                                     injection_policy=injection_policy,
+                                     replace_with_kernel_inject=True)
     model = model.module
 
     args.length = adjust_length_to_model(args.length, max_sequence_length=model.config.max_position_embeddings)

diff --git a/inference/huggingface/test-electra.py b/inference/huggingface/test-electra.py
new file mode 100644
index 000000000..a193584f9
--- /dev/null
+++ b/inference/huggingface/test-electra.py
@@ -0,0 +1,28 @@
+from transformers import pipeline
+import transformers
+import deepspeed
+import torch
+import os
+from transformers.models.electra.modeling_electra import ElectraLayer
+
+local_rank = int(os.getenv('LOCAL_RANK', '0'))
+world_size = int(os.getenv('WORLD_SIZE', '4'))
+
+pipe = pipeline('fill-mask', model="google/electra-base-generator",
+                tokenizer="google/electra-base-generator")
+
+# The injection_policy shows two things:
+# 1. which layer module we need to add Tensor-Parallelism to
+# 2. the name of one or several linear layers: a) attention_output,
+#    and b) transformer output
+pipe.model = deepspeed.init_inference(
+    pipe.model,
+    mp_size=world_size,
+    dtype=torch.float,
+    injection_policy={ElectraLayer: ('output.dense')}
+)
+pipe.device = torch.device(f'cuda:{local_rank}')
+output = pipe(f"HuggingFace is creating a {pipe.tokenizer.mask_token} that the community uses to solve NLP tasks.")
+
+if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    print(output)

diff --git a/inference/huggingface/test-roberta.py b/inference/huggingface/test-roberta.py
new file mode 100644
index 000000000..a66e018a4
--- /dev/null
+++ b/inference/huggingface/test-roberta.py
@@ -0,0 +1,29 @@
+from transformers import pipeline
+import transformers
+import deepspeed
+import torch
+import os
+from transformers.models.roberta.modeling_roberta import RobertaLayer
+
+local_rank = int(os.getenv('LOCAL_RANK', '0'))
+world_size = int(os.getenv('WORLD_SIZE', '4'))
+
+pipe = pipeline('fill-mask', model="roberta-large", device=local_rank)
+
+# The injection_policy shows two things:
+# 1. which layer module we need to add Tensor-Parallelism to
+# 2. the name of several linear layers: a) attention_output,
+#    and b) transformer output
+
+pipe.model = deepspeed.init_inference(
+    pipe.model,
+    mp_size=world_size,
+    dtype=torch.float,
+    injection_policy={RobertaLayer: ('output.dense')}
+)
+
+pipe.device = torch.device(f'cuda:{local_rank}')
+output = pipe("Hello I'm a model.")
+
+if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    print(output)
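The ELECTRA and RoBERTa tests above use the same injection_policy mechanism, while gpt-neo.py and run_generation.py switch to replace_with_kernel_inject=True, i.e. DeepSpeed's fused inference kernels rather than the generic per-layer tensor-parallel injection. The 'output.dense' name in the two new fill-mask tests points at the transformer-output projection inside each encoder layer; the sketch below shows how to confirm where it lives in the module tree. It is illustrative only and not part of the patches; roberta-base is assumed instead of roberta-large simply to keep the download small.

    # Sketch: confirm that RobertaLayer exposes the 'output.dense' Linear that the
    # injection policy in test-roberta.py refers to.
    from transformers import RobertaModel
    from transformers.models.roberta.modeling_roberta import RobertaLayer

    model = RobertaModel.from_pretrained("roberta-base")
    layer = model.encoder.layer[0]
    assert isinstance(layer, RobertaLayer)
    print(layer.output.dense)             # feed-forward output projection (3072 -> 768)
    print(layer.attention.output.dense)   # attention output projection (768 -> 768)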
From 3ddd01154ab0d85904b920f321042bf5dca1a70d Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Thu, 4 Nov 2021 04:54:49 +0500
Subject: [PATCH 4/5] remove pdb

---
 inference/huggingface/gpt-neo.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/inference/huggingface/gpt-neo.py b/inference/huggingface/gpt-neo.py
index 614fcc5b2..1c0638e5a 100644
--- a/inference/huggingface/gpt-neo.py
+++ b/inference/huggingface/gpt-neo.py
@@ -15,7 +15,6 @@
     "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
     .format(local_rank,
             world_size))
-import pdb;pdb.set_trace()
 generator = pipeline('text-generation',
                      model='EleutherAI/gpt-neo-2.7B',
                      device=local_rank)
From 4380a82552fb31d2ac28ace0eddf9b5f4d4ec13c Mon Sep 17 00:00:00 2001
From: Reza Yazdani
Date: Thu, 4 Nov 2021 09:43:36 +0500
Subject: [PATCH 5/5] add tests for GPT-J and wav2vec2 model architectures

---
 inference/huggingface/gpt-neo.py       |  1 +
 inference/huggingface/test-gptj.py     | 36 ++++++++++++++++++
 inference/huggingface/test-wav2vec2.py | 51 ++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100644 inference/huggingface/test-gptj.py
 create mode 100644 inference/huggingface/test-wav2vec2.py

diff --git a/inference/huggingface/gpt-neo.py b/inference/huggingface/gpt-neo.py
index 1c0638e5a..001be9085 100644
--- a/inference/huggingface/gpt-neo.py
+++ b/inference/huggingface/gpt-neo.py
@@ -22,6 +22,7 @@
                                            mp_size=world_size,
                                            dtype=torch.float,
                                            replace_method='auto',
+                                           #injection_policy={gpt2_transformer: ('attention.out_proj','mlp.c_proj')},
                                            replace_with_kernel_inject=True)
 string = generator("DeepSpeed is", do_sample=True, min_length=50)
 print(string)
\ No newline at end of file

diff --git a/inference/huggingface/test-gptj.py b/inference/huggingface/test-gptj.py
new file mode 100644
index 000000000..ab13994ae
--- /dev/null
+++ b/inference/huggingface/test-gptj.py
@@ -0,0 +1,36 @@
+import os
+import torch
+import deepspeed
+import transformers
+
+from deepspeed import module_inject
+from transformers import pipeline
+from transformers.models.gptj.modeling_gptj import GPTJBlock
+
+# Get local gpu rank from torch.distributed/deepspeed launcher
+local_rank = int(os.getenv('LOCAL_RANK', '0'))
+world_size = int(os.getenv('WORLD_SIZE', '1'))
+
+print(
+    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
+    .format(local_rank,
+            world_size))
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+
+inp_tokens = tokenizer("DeepSpeed is", return_tensors="pt",)
+model = deepspeed.init_inference(model,
+                                 mp_size=world_size,
+                                 dtype=torch.float,
+                                 injection_policy={GPTJBlock: ('attn.out_proj','mlp.fc_out')},
+                                 replace_with_kernel_inject=False)
+
+for token in inp_tokens:
+    if torch.is_tensor(inp_tokens[token]):
+        inp_tokens[token] = inp_tokens[token].to(f'cuda:{local_rank}')
+
+model.cuda().to(f'cuda:{local_rank}')
+string = tokenizer.batch_decode(model.generate(**inp_tokens,min_length=50,))[0]
+print(string)
\ No newline at end of file

diff --git a/inference/huggingface/test-wav2vec2.py b/inference/huggingface/test-wav2vec2.py
new file mode 100644
index 000000000..da7e778af
--- /dev/null
+++ b/inference/huggingface/test-wav2vec2.py
@@ -0,0 +1,51 @@
+from datasets import load_dataset
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import soundfile as sf
+import torch
+from jiwer import wer
+import os
+import torch
+import deepspeed
+from deepspeed import module_inject
+from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2EncoderLayer
+
+librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
+
+# Get local gpu rank from torch.distributed/deepspeed launcher
+local_rank = int(os.getenv('LOCAL_RANK', '0'))
+world_size = int(os.getenv('WORLD_SIZE', '1'))
+
+print(
+    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
+    .format(local_rank,
+            world_size))
+
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+
+model = deepspeed.init_inference(model,
+                                 mp_size=world_size,
+                                 dtype=torch.float,
+                                 injection_policy={Wav2Vec2EncoderLayer: ('attention.out_proj','feed_forward.output_dense')},
+                                 replace_with_kernel_inject=False)
+model.to(f'cuda:{local_rank}')
+def map_to_array(batch):
+    speech, _ = sf.read(batch["file"])
+    batch["speech"] = speech
+    return batch
+
+librispeech_eval = librispeech_eval.map(map_to_array)
+
+def map_to_pred(batch):
+    input_values = processor(batch["speech"], return_tensors="pt", padding="longest").input_values
+    with torch.no_grad():
+        logits = model(input_values.to(f'cuda:{local_rank}')).logits
+
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)
+    batch["transcription"] = transcription
+    return batch
+
+result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["speech"])
+
+print("WER:", wer(result["text"], result["transcription"]))
\ No newline at end of file
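The GPT-J policy names ('attn.out_proj', 'mlp.fc_out') and the wav2vec2 names ('attention.out_proj', 'feed_forward.output_dense') can be checked against the module tree in the same way as the T5 and RoBERTa sketches earlier in the series. The wav2vec2 test reports word error rate via jiwer; the toy illustration below is not part of the patch and only makes the final print statement concrete.

    # Sketch: jiwer.wer(reference, hypothesis) returns the word error rate, i.e.
    # (substitutions + insertions + deletions) / number of reference words.
    from jiwer import wer

    reference = ["the cat sat on the mat"]
    hypothesis = ["the cat sat on a mat"]
    print(wer(reference, hypothesis))   # one substitution over six reference words -> ~0.167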