From efa354dd7d3ce01482a4f5512e24cbaa379351fe Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 9 Jul 2022 22:59:39 -0700 Subject: [PATCH 01/55] hardcode the dtype depending on the model --- scripts/inference/bloom-inference.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index 17da46795..f348cc893 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -68,16 +68,23 @@ def get_checkpoint_files(pretrained_model_name_or_path): tokenizer = AutoTokenizer.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name == "bigscience/bloom" else torch.float16 + +#dtype = config.dtype +print(dtype) + model_hidden_size = config.hidden_size train_batch_size = 1 * world_size model = AutoModelForCausalLM.from_config(config) ds_config = { "fp16": { - "enabled": model.dtype == torch.float16, + "enabled": dtype == torch.float16, }, "bf16": { - "enabled": model.dtype == torch.bfloat16, + "enabled": dtype == torch.bfloat16, }, "zero_optimization": { "stage": 3, @@ -97,6 +104,8 @@ def get_checkpoint_files(pretrained_model_name_or_path): "wall_clock_breakdown": False } +print(ds_config) + dschf = HfDeepSpeedConfig(ds_config) model = model.eval() From cafc3f5d3c80da6518f0cdd775677df118329591 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 10 Jul 2022 12:10:18 +0500 Subject: [PATCH 02/55] change the mp based on the world_size --- scripts/inference/bloom-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index f348cc893..a98ec5ee6 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -132,7 +132,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): model = deepspeed.init_inference(model, - mp_size=1, + mp_size=world_size, dtype=torch.half, checkpoint=checkpoints_json, #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')} From daeb293be6caa41a70d6dd18ea4827e066333035 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 10 Jul 2022 07:05:06 -0700 Subject: [PATCH 03/55] remove hardcoded world_size --- scripts/inference/bloom-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index f348cc893..a98ec5ee6 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -132,7 +132,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): model = deepspeed.init_inference(model, - mp_size=1, + mp_size=world_size, dtype=torch.half, checkpoint=checkpoints_json, #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')} From 7d5f7d467d82572088068b5e06faf432d49a89a0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 10 Jul 2022 07:08:30 -0700 Subject: [PATCH 04/55] add bigscience/bigscience-small-testing --- scripts/inference/bloom-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index a98ec5ee6..078312d7c 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -70,7 +70,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): config = 
AutoConfig.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` -dtype = torch.bfloat16 if model_name == "bigscience/bloom" else torch.float16 +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 #dtype = config.dtype print(dtype) From 1ff0f698f4cc5a355402049990e7dbdeee27200b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 10 Jul 2022 09:07:47 -0700 Subject: [PATCH 05/55] fixes --- scripts/inference/bloom-inference.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index 078312d7c..c9eb78c8e 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -72,6 +72,9 @@ def get_checkpoint_files(pretrained_model_name_or_path): # XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 +# XXX: for now ds-inference only works with fp16 +dtype = torch.float16 + #dtype = config.dtype print(dtype) @@ -131,11 +134,15 @@ def get_checkpoint_files(pretrained_model_name_or_path): json.dump(data, f) +# use one of these args to `init_inference` +# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work +# 2. replace_with_kernel_inject is the faster one (fast fused kernels) + model = deepspeed.init_inference(model, mp_size=world_size, dtype=torch.half, checkpoint=checkpoints_json, - #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')} + #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}, replace_with_kernel_inject=True ) model = model.module From 56b24ed3477904c9887d8bcd097f5894d948f365 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 10 Jul 2022 09:44:25 -0700 Subject: [PATCH 06/55] add zero-inference script --- scripts/inference/bloom-inference-zero.py | 102 ++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 scripts/inference/bloom-inference-zero.py diff --git a/scripts/inference/bloom-inference-zero.py b/scripts/inference/bloom-inference-zero.py new file mode 100644 index 000000000..fd096cc69 --- /dev/null +++ b/scripts/inference/bloom-inference-zero.py @@ -0,0 +1,102 @@ + +# usage: +# +# direct HF +# deepspeed --num_gpus 1 bloom-test.py --name bigscience/bloom-350m +# +# via deepspeed/zero-3 inference +# deepspeed --num_gpus 1 bloom-test.py --name bigscience/bloom-350m --deepspeed +# + + +import torch +import deepspeed +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline +from argparse import ArgumentParser +import os +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +from transformers.deepspeed import HfDeepSpeedConfig +import torch.distributed as dist + +parser = ArgumentParser() + +parser.add_argument("--name", required=True, type=str) +parser.add_argument("--local_rank", required=False, type=int) +parser.add_argument("--deepspeed", action="store_true") +args = parser.parse_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +print( + "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************" + .format(local_rank, + world_size)) + +config = AutoConfig.from_pretrained(args.name) + +model_hidden_size = config.hidden_size + +train_batch_size = 1 * 
world_size +model_name = args.name +model = AutoModelForCausalLM.from_pretrained(model_name) + + +# Note: you need to edit nvme_path to an actual path on your filesystem where the model will be offloaded to +ds_config = { + "fp16": { + "enabled": model.dtype == torch.float16, + }, + "bf16": { + "enabled": model.dtype == torch.bfloat16, + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "nvme", + "nvme_path": "/mnt/nvme0/offload/", + "pin_memory": True, + "buffer_count": 4, + "fast_init": False + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} + +deepspeed.runtime.utils.see_memory_usage('pre-init', force=True) +if args.deepspeed: + dschf = HfDeepSpeedConfig(ds_config) + +#generator = pipeline('text-generation', model=args.name, device=local_rank, framework="pt") +deepspeed.runtime.utils.see_memory_usage('post-init', force=True) + + +if args.deepspeed: + ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] + ds_engine.module.eval() +# generator.model = ds_engine.module + deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) +else: + dist.init_process_group("nccl") + model = model.to(device=local_rank) + +#response = generator('DeepSpeed is', min_length=50, max_length=50, do_sample=False) + +text_in = 'DeepSpeed is' + +tokenizer = AutoTokenizer.from_pretrained(model_name) +inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) +with torch.no_grad(): + model = ds_engine.module if args.deepspeed else model + outputs = model.generate(inputs, synced_gpus=True, min_length=50, max_length=50, do_sample=False) +text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(f"in={text_in}\nout={text_out}") From 67aab37c30524cb008e8dcddb4e9307683919ae2 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 10 Jul 2022 22:15:07 -0700 Subject: [PATCH 07/55] fixes --- scripts/inference/bloom-inference-zero.py | 36 +++++++++++++---------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/scripts/inference/bloom-inference-zero.py b/scripts/inference/bloom-inference-zero.py index fd096cc69..e38b40520 100644 --- a/scripts/inference/bloom-inference-zero.py +++ b/scripts/inference/bloom-inference-zero.py @@ -23,7 +23,7 @@ parser.add_argument("--name", required=True, type=str) parser.add_argument("--local_rank", required=False, type=int) -parser.add_argument("--deepspeed", action="store_true") +#parser.add_argument("--deepspeed", action="store_true") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) @@ -40,16 +40,15 @@ train_batch_size = 1 * world_size model_name = args.name -model = AutoModelForCausalLM.from_pretrained(model_name) - +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 # Note: you need to edit nvme_path to an actual path on your filesystem where the model will be offloaded to ds_config = { "fp16": { - "enabled": model.dtype == torch.float16, + "enabled": dtype == torch.float16, }, "bf16": { - "enabled": model.dtype == torch.bfloat16, + "enabled": dtype == torch.bfloat16, }, "zero_optimization": { "stage": 3, @@ -58,6 +57,7 @@ "nvme_path": 
"/mnt/nvme0/offload/", "pin_memory": True, "buffer_count": 4, + "buffer_size": 4e9, # for bloom, otherwise the default 1e8 should be enough "fast_init": False }, "overlap_comm": True, @@ -73,21 +73,27 @@ } deepspeed.runtime.utils.see_memory_usage('pre-init', force=True) -if args.deepspeed: - dschf = HfDeepSpeedConfig(ds_config) +#if args.deepspeed: +dschf = HfDeepSpeedConfig(ds_config) + +model = AutoModelForCausalLM.from_pretrained(model_name) #generator = pipeline('text-generation', model=args.name, device=local_rank, framework="pt") deepspeed.runtime.utils.see_memory_usage('post-init', force=True) - -if args.deepspeed: - ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] - ds_engine.module.eval() +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() # generator.model = ds_engine.module - deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) -else: - dist.init_process_group("nccl") - model = model.to(device=local_rank) +deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) + +# if args.deepspeed: +# ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +# ds_engine.module.eval() +# # generator.model = ds_engine.module +# deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) +# else: +# dist.init_process_group("nccl") +# model = model.to(device=local_rank) #response = generator('DeepSpeed is', min_length=50, max_length=50, do_sample=False) From 328ab0cc7d33671fb824dfe04f0f85ab253b87bd Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 11 Jul 2022 07:39:25 -0700 Subject: [PATCH 08/55] fix --- scripts/inference/bloom-inference-zero.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-inference-zero.py b/scripts/inference/bloom-inference-zero.py index e38b40520..af33f984c 100644 --- a/scripts/inference/bloom-inference-zero.py +++ b/scripts/inference/bloom-inference-zero.py @@ -102,7 +102,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) with torch.no_grad(): - model = ds_engine.module if args.deepspeed else model + #model = ds_engine.module if args.deepspeed else model outputs = model.generate(inputs, synced_gpus=True, min_length=50, max_length=50, do_sample=False) text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) print(f"in={text_in}\nout={text_out}") From f2628b03c4d164f72a5b901b8c1a8a0a4bb688b5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 00:48:00 +0200 Subject: [PATCH 09/55] working script --- scripts/inference/bloom-inference.py | 107 ++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 20 deletions(-) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-inference.py index c9eb78c8e..3e99522cd 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-inference.py @@ -3,15 +3,17 @@ # deepspeed --num_gpus 1 bloom-inference.py --name bigscience/bloom-350m # -#import glob +import glob from argparse import ArgumentParser from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.deepspeed import HfDeepSpeedConfig from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock import deepspeed import io +import sys import json import os +import gc import torch import torch.distributed as dist @@ -30,6 +32,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): # shards into cache and returning 
the cached entries - note that I removed most arguments from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url + from transformers.utils.hub import EntryNotFoundError cache_dir = None is_sharded = False @@ -72,15 +75,20 @@ def get_checkpoint_files(pretrained_model_name_or_path): # XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 -# XXX: for now ds-inference only works with fp16 -dtype = torch.float16 +kernel_inject = True +#kernel_inject = False + +if kernel_inject: + # XXX: for now ds-inference only works with fp16 + dtype = torch.float16 +else: + dtype = torch.bfloat16 #dtype = config.dtype print(dtype) model_hidden_size = config.hidden_size train_batch_size = 1 * world_size -model = AutoModelForCausalLM.from_config(config) ds_config = { "fp16": { @@ -91,10 +99,10 @@ def get_checkpoint_files(pretrained_model_name_or_path): }, "zero_optimization": { "stage": 3, - "offload_param": { - "device": "cpu", - "pin_memory": True - }, +# "offload_param": { +# "device": "none", +# "pin_memory": True +# }, "overlap_comm": True, "contiguous_gradients": True, "reduce_bucket_size": model_hidden_size * model_hidden_size, @@ -111,40 +119,94 @@ def get_checkpoint_files(pretrained_model_name_or_path): dschf = HfDeepSpeedConfig(ds_config) +torch.cuda.empty_cache() +gc.collect() +deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) + +model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) + +deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) + model = model.eval() + + ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] ds_engine.module.eval() model = ds_engine.module +# a must to remove ZeRO hooks! +ds_engine.destroy() + +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +def ds_clear_params(ds_engine): + for p in ds_engine.parameters(): + if hasattr(p, "ds_tensor"): + p.ds_tensor = torch.empty(0, dtype=p.dtype, device=p.device) + p.ds_status = ZeroParamStatus.NOT_AVAILABLE + +# this frees the memory used by zero +ds_clear_params(ds_engine) + +#ds_engine.module = None +del ds_engine + +torch.cuda.empty_cache() +gc.collect() +deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) checkpoints_json = "checkpoints.json" -with io.open(checkpoints_json, 'w', encoding='utf-8') as f: +def write_checkponts_json(): + + with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - #checkpoint_files = glob.glob(f"args.checkpoint_dir/*bin") - checkpoint_files = get_checkpoint_files(model_name) + checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" + checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + #checkpoint_files = get_checkpoint_files(model_name) - print("Checkpoint files:", checkpoint_files) + print("Checkpoint files:", checkpoint_files) - data = { - "type": "BLOOM-176B", - "checkpoints": checkpoint_files, - "version": 1.0 - } - json.dump(data, f) + data = { + "type": "BLOOM-176B", + "checkpoints": checkpoint_files, + "version": 1.0 + } + json.dump(data, f) +rank = dist.get_rank() +if rank == 0: + write_checkponts_json() +dist.barrier() + +#print("before deepspeed.init_inference") +torch.cuda.empty_cache() +gc.collect() +deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) # use one of these args to `init_inference` # 1. 
injection_policy is the slower version, but it's plain pytorch so it'll always work # 2. replace_with_kernel_inject is the faster one (fast fused kernels) +if kernel_inject: + kwargs = dict(replace_with_kernel_inject=True) +else: + kwargs = dict(injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}) + +#checkpoints_json=None model = deepspeed.init_inference(model, mp_size=world_size, dtype=torch.half, checkpoint=checkpoints_json, - #injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}, - replace_with_kernel_inject=True + **kwargs ) +# injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}, +# #replace_with_kernel_inject=True + +torch.cuda.empty_cache() +gc.collect() +deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) + +#print("after deepspeed.init_inference") model = model.module text_in = 'DeepSpeed is' @@ -167,3 +229,8 @@ def get_checkpoint_files(pretrained_model_name_or_path): text_out = tokenizer.batch_decode(gen_tokens)[0] print(f"in={text_in}\nout={text_out}") + +torch.cuda.empty_cache() +gc.collect() +deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + From 195288e5f14fccdae9baaa1bf9215966bf2c535a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 00:51:55 +0200 Subject: [PATCH 10/55] renames --- .../inference/{bloom-inference.py => bloom-ds-inference.py} | 2 +- .../{bloom-inference-zero.py => bloom-ds-zero-inference.py} | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) rename scripts/inference/{bloom-inference.py => bloom-ds-inference.py} (98%) rename scripts/inference/{bloom-inference-zero.py => bloom-ds-zero-inference.py} (95%) diff --git a/scripts/inference/bloom-inference.py b/scripts/inference/bloom-ds-inference.py similarity index 98% rename from scripts/inference/bloom-inference.py rename to scripts/inference/bloom-ds-inference.py index 3e99522cd..4fbb511b2 100644 --- a/scripts/inference/bloom-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -1,6 +1,6 @@ # usage: -# deepspeed --num_gpus 1 bloom-inference.py --name bigscience/bloom-350m +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom # import glob diff --git a/scripts/inference/bloom-inference-zero.py b/scripts/inference/bloom-ds-zero-inference.py similarity index 95% rename from scripts/inference/bloom-inference-zero.py rename to scripts/inference/bloom-ds-zero-inference.py index af33f984c..047eb0688 100644 --- a/scripts/inference/bloom-inference-zero.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -1,11 +1,8 @@ # usage: # -# direct HF -# deepspeed --num_gpus 1 bloom-test.py --name bigscience/bloom-350m -# # via deepspeed/zero-3 inference -# deepspeed --num_gpus 1 bloom-test.py --name bigscience/bloom-350m --deepspeed +# deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom # From 3c7b2cb64be55b9a9c906d95f1a502375d4cd007 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 16:53:57 -0700 Subject: [PATCH 11/55] fixes --- scripts/inference/README.md | 16 ++++++++++++++++ scripts/inference/bloom-ds-inference.py | 13 +++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 1a958c28b..8e50329d2 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -1 +1,17 @@ # Inference scripts for BLOOM + +## Deepspeed-Inference + +https://www.deepspeed.ai/tutorials/inference-tutorial/ + +``` +deepspeed --num_gpus 8 
bloom-ds-inference.py --name bigscience/bloom +``` + +## Deepspeed ZeRO-Inference + +https://www.deepspeed.ai/tutorials/zero/ + +``` +deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom +``` diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 4fbb511b2..78f342864 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -27,17 +27,24 @@ local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) + def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or # shards into cache and returning the cached entries - note that I removed most arguments from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url from transformers.utils.hub import EntryNotFoundError + from transformers.modeling_utils import get_checkpoint_shard_files cache_dir = None is_sharded = False + + revision = None + #revision = "sharded" + filename = WEIGHTS_NAME - archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename) + archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision +) try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) @@ -49,6 +56,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): archive_file = hf_bucket_url( pretrained_model_name_or_path, filename=WEIGHTS_INDEX_NAME, + revision=revision, ) resolved_archive_file = cached_path( archive_file, @@ -62,11 +70,13 @@ def get_checkpoint_files(pretrained_model_name_or_path): pretrained_model_name_or_path, resolved_archive_file, cache_dir=cache_dir, + revision=revision ) return resolved_archive_file + model_name = args.name tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -233,4 +243,3 @@ def write_checkponts_json(): torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) - From 6c5c23ba884bd50db831123e5ddcb018b816362d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 03:17:21 +0200 Subject: [PATCH 12/55] fix for offline use --- scripts/inference/bloom-ds-inference.py | 41 ++++++++++++++++--------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 78f342864..dbb653f4d 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -32,7 +32,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or # shards into cache and returning the cached entries - note that I removed most arguments - from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url + from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode from transformers.utils.hub import EntryNotFoundError from transformers.modeling_utils import get_checkpoint_shard_files @@ -42,15 +42,19 @@ def get_checkpoint_files(pretrained_model_name_or_path): revision = None #revision = "sharded" + if is_offline_mode(): + print("Offline mode: forcing local_files_only=True") + local_files_only = True + filename = WEIGHTS_NAME archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision ) try: - resolved_archive_file = 
cached_path(archive_file, cache_dir=cache_dir) + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) return [resolved_archive_file] - except EntryNotFoundError: + except (EntryNotFoundError, FileNotFoundError): if filename == WEIGHTS_NAME: # Maybe the checkpoint is sharded, we try to grab the index name in this case. archive_file = hf_bucket_url( @@ -61,6 +65,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): resolved_archive_file = cached_path( archive_file, cache_dir=cache_dir, + local_files_only=local_files_only, ) is_sharded = True @@ -70,14 +75,16 @@ def get_checkpoint_files(pretrained_model_name_or_path): pretrained_model_name_or_path, resolved_archive_file, cache_dir=cache_dir, - revision=revision + revision=revision ) return resolved_archive_file +model_name = args.name +#print(get_checkpoint_files(model_name)) +#die -model_name = args.name tokenizer = AutoTokenizer.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name) @@ -95,7 +102,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): dtype = torch.bfloat16 #dtype = config.dtype -print(dtype) +#print(dtype) model_hidden_size = config.hidden_size train_batch_size = 1 * world_size @@ -125,7 +132,6 @@ def get_checkpoint_files(pretrained_model_name_or_path): "wall_clock_breakdown": False } -print(ds_config) dschf = HfDeepSpeedConfig(ds_config) @@ -139,6 +145,11 @@ def get_checkpoint_files(pretrained_model_name_or_path): model = model.eval() +rank = dist.get_rank() + +if rank == 0: + print(ds_config) + ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] ds_engine.module.eval() @@ -170,9 +181,9 @@ def write_checkponts_json(): with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" - checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") - #checkpoint_files = get_checkpoint_files(model_name) + #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" + #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + checkpoint_files = get_checkpoint_files(model_name) print("Checkpoint files:", checkpoint_files) @@ -183,7 +194,6 @@ def write_checkponts_json(): } json.dump(data, f) -rank = dist.get_rank() if rank == 0: write_checkponts_json() dist.barrier() @@ -219,7 +229,7 @@ def write_checkponts_json(): #print("after deepspeed.init_inference") model = model.module -text_in = 'DeepSpeed is' +text_in = 'DeepSpeed is a machine learning framework' tokens = tokenizer(text_in, return_tensors="pt") @@ -230,15 +240,16 @@ def write_checkponts_json(): with torch.no_grad(): gen_tokens = model.generate( **tokens, - min_length=50, - max_length=50, + min_length=100, + max_length=100, do_sample=False, ) text_out = tokenizer.batch_decode(gen_tokens)[0] -print(f"in={text_in}\nout={text_out}") +if rank == 0: + print(f"in={text_in}\nout={text_out}") torch.cuda.empty_cache() gc.collect() From 6b19227490b893744c6dedec063ce4ad890e855f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 03:47:56 +0200 Subject: [PATCH 13/55] add benchmark --- scripts/inference/bloom-ds-inference.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index dbb653f4d..bb143eed6 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -254,3 +254,35 @@ def write_checkponts_json(): 
torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +# benchmark it! +if 1: + print("Now running a benchmark of performance") + + import time + + # warm up + for i in range(5): + gen_tokens = model.generate( + **tokens, + min_length=100, + max_length=100, + do_sample=True, + ) + + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + for i in range(5): + + gen_tokens = model.generate( + **tokens, + min_length=100, + max_length=100, + do_sample=True, + ) + torch.cuda.synchronize() + if torch.distributed.get_rank() == 0: + print(f'token latency: {(time.time() - t0)/500}') + From 10cbb2d46924607ff69ea5a5b96ab3bae509790f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 04:42:31 +0200 Subject: [PATCH 14/55] add benchmark --- scripts/inference/bloom-ds-inference.py | 63 ++++++++++++++++--------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index bb143eed6..4442a6a58 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -16,12 +16,17 @@ import gc import torch import torch.distributed as dist +import time + +t_start = time.time() + +num_tokens = 100 parser = ArgumentParser() parser.add_argument("--name", required=True, type=str) parser.add_argument("--local_rank", required=False, type=int) -parser.add_argument("--deepspeed", action="store_true") +parser.add_argument("--benchmark", action="store_true") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) @@ -39,16 +44,18 @@ def get_checkpoint_files(pretrained_model_name_or_path): cache_dir = None is_sharded = False + # XXX: preparation for revision branches if needed revision = None #revision = "sharded" + # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with + # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" if is_offline_mode(): print("Offline mode: forcing local_files_only=True") local_files_only = True filename = WEIGHTS_NAME - archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision -) + archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) @@ -83,8 +90,9 @@ def get_checkpoint_files(pretrained_model_name_or_path): model_name = args.name #print(get_checkpoint_files(model_name)) -#die +if local_rank == 0: + print(f"*** Loading the model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) config = AutoConfig.from_pretrained(model_name) @@ -226,9 +234,14 @@ def write_checkponts_json(): gc.collect() deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) -#print("after deepspeed.init_inference") +if rank == 0: + print(f"*** Starting to generate {num_tokens}") + model = model.module +if args.benchmark: + t_ready = time.time() + text_in = 'DeepSpeed is a machine learning framework' tokens = tokenizer(text_in, return_tensors="pt") @@ -240,8 +253,8 @@ def write_checkponts_json(): with torch.no_grad(): gen_tokens = model.generate( **tokens, - min_length=100, - max_length=100, + min_length=num_tokens, + max_length=num_tokens, do_sample=False, ) @@ -251,22 +264,24 @@ def write_checkponts_json(): if rank == 0: print(f"in={text_in}\nout={text_out}") +if args.benchmark: + t_finish = time.time() 
+ torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) # benchmark it! -if 1: - print("Now running a benchmark of performance") - - import time +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") # warm up - for i in range(5): + for i in range(1): gen_tokens = model.generate( **tokens, - min_length=100, - max_length=100, + min_length=num_tokens, + max_length=num_tokens, do_sample=True, ) @@ -274,15 +289,21 @@ def write_checkponts_json(): # benchmark t0 = time.time() - for i in range(5): - + cycles = 5 + for i in range(cycles): gen_tokens = model.generate( **tokens, - min_length=100, - max_length=100, + min_length=num_tokens, + max_length=num_tokens, do_sample=True, ) torch.cuda.synchronize() - if torch.distributed.get_rank() == 0: - print(f'token latency: {(time.time() - t0)/500}') - + if rank == 0: + througput = (time.time() - t0)/(cycles*num_tokens) + print(f""" +*** Performance stats: +Start to ready to generate {t_ready - t_start:.3f} secs +Generate {num_tokens} tokens: {t_finish - t_ready:.3f} secs +Start to finish {t_finish - t_start:.3f} secs +Througput per token: {througput:.4f} secs +""") From 494c212e2d692ac332bf5d7f1bd04e161512740f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 19:44:54 -0700 Subject: [PATCH 15/55] update --- scripts/inference/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 8e50329d2..f6b365107 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -8,6 +8,24 @@ https://www.deepspeed.ai/tutorials/inference-tutorial/ deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom ``` +Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay): + +- Memory per GPU: 50GB +- Peak CPU memory: ~10GB per process, but much more while loading (will improve) + + +``` +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --benchmark + +*** Performance stats: +Start to ready to generate 698.697 secs +Generate 100 tokens: 23.008 secs +Start to finish 721.705 secs +Througput per token: 0.0412 secs +``` + + + ## Deepspeed ZeRO-Inference https://www.deepspeed.ai/tutorials/zero/ From 2b67c0d9b538cef30b7a68d886e2110bdf4a20b6 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 19:51:16 -0700 Subject: [PATCH 16/55] cleanup --- scripts/inference/README.md | 8 +++++--- scripts/inference/bloom-ds-inference.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index f6b365107..c6cf2c145 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -10,20 +10,22 @@ deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay): -- Memory per GPU: 50GB -- Peak CPU memory: ~10GB per process, but much more while loading (will improve) ``` deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --benchmark *** Performance stats: -Start to ready to generate 698.697 secs +Start to ready to generate: 698.697 secs Generate 100 tokens: 23.008 secs Start to finish 721.705 secs Througput per token: 0.0412 secs ``` +While processing memory per process: + +- GPU: ~50GB +- CPU: ~10GB ## Deepspeed ZeRO-Inference diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 4442a6a58..2b03c0937 
100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -302,8 +302,8 @@ def write_checkponts_json(): througput = (time.time() - t0)/(cycles*num_tokens) print(f""" *** Performance stats: -Start to ready to generate {t_ready - t_start:.3f} secs +Start to ready to generate: {t_ready - t_start:.3f} secs Generate {num_tokens} tokens: {t_finish - t_ready:.3f} secs Start to finish {t_finish - t_start:.3f} secs -Througput per token: {througput:.4f} secs +Throughput per token: {througput:.4f} secs """) From 3853724e57b637c0339307bbd0792515541b60d1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 19:57:17 -0700 Subject: [PATCH 17/55] update --- scripts/inference/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index c6cf2c145..22610595b 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -2,15 +2,16 @@ ## Deepspeed-Inference +Tensor-Parallelism and efficient fused CUDA kernels: https://www.deepspeed.ai/tutorials/inference-tutorial/ ``` deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom ``` -Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay): - +Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) +Adding `--benchmark` to activate the benchmarks ``` deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --benchmark From 18967399e1898c148ec0c2be55fe9463eb36e878 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 21:55:06 -0700 Subject: [PATCH 18/55] msecs --- scripts/inference/bloom-ds-inference.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 2b03c0937..1a53f7fa8 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -302,8 +302,8 @@ def write_checkponts_json(): througput = (time.time() - t0)/(cycles*num_tokens) print(f""" *** Performance stats: +Throughput per token: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs Generate {num_tokens} tokens: {t_finish - t_ready:.3f} secs -Start to finish {t_finish - t_start:.3f} secs -Throughput per token: {througput:.4f} secs +Start to finish: {t_finish - t_start:.3f} secs """) From 7c9daaf1ec5364ebb1de391f0420be7668b74ae1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 22:08:41 -0700 Subject: [PATCH 19/55] cleanup --- scripts/inference/bloom-ds-inference.py | 94 +++++++++++++++---------- 1 file changed, 55 insertions(+), 39 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 1a53f7fa8..61096e4cd 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -1,7 +1,19 @@ - # usage: # deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom # +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark +# + + +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. 
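For orientation, here is a condensed sketch of those four steps, assuming placeholder values for the model name, the checkpoint shard paths and a minimal ZeRO-3 config; the actual script in this patch additionally handles checkpoint discovery, memory probes, the ZeRO tear-down helper and benchmarking:

```python
import json
import os

import deepspeed
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.deepspeed import HfDeepSpeedConfig

model_name = "bigscience/bloom"                      # placeholder
world_size = int(os.getenv("WORLD_SIZE", "1"))
config = AutoConfig.from_pretrained(model_name)

ds_config = {                                        # minimal ZeRO-3 config, placeholder sizes
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 3},
    "train_batch_size": 1 * world_size,
    "train_micro_batch_size_per_gpu": 1,
}

# 1. let Deepspeed-ZeRO materialize the model sharded over the GPUs, without loading a checkpoint
dschf = HfDeepSpeedConfig(ds_config)                 # must exist before from_config is called
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16).eval()
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
model = ds_engine.module

# 2. free the ZeRO storage and hooks (ds_engine.destroy() plus clearing each param's ds_tensor,
#    as done by ds_clear_params() further down in the script)

# 3. hand the bare model to Deepspeed-Inference, which loads the real weights from a
#    checkpoints.json manifest listing the shard files
with open("checkpoints.json", "w") as f:
    json.dump({"type": "BLOOM-176B", "checkpoints": ["/path/to/shard_0.bin"], "version": 1.0}, f)
model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.half,
    checkpoint="checkpoints.json",
    replace_with_kernel_inject=True,
).module

# 4. generate as usual, e.g. model.generate(**tokens, max_length=100)
```

Splitting instantiation (step 1) from weight loading (step 3) is what avoids ever materializing the full 176B model on a single device before the inference engine shards it.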
+# + import glob from argparse import ArgumentParser @@ -24,15 +36,17 @@ parser = ArgumentParser() -parser.add_argument("--name", required=True, type=str) -parser.add_argument("--local_rank", required=False, type=int) -parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) +### Model loading and instantiating on GPU (via ZeRO) + def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or # shards into cache and returning the cached entries - note that I removed most arguments @@ -98,8 +112,12 @@ def get_checkpoint_files(pretrained_model_name_or_path): config = AutoConfig.from_pretrained(model_name) # XXX: can't automatically derive dtype via config's `from_pretrained` -dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 +#dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +# use one of these args to `init_inference` +# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work +# 2. replace_with_kernel_inject is the faster one (fast fused kernels) kernel_inject = True #kernel_inject = False @@ -124,10 +142,6 @@ def get_checkpoint_files(pretrained_model_name_or_path): }, "zero_optimization": { "stage": 3, -# "offload_param": { -# "device": "none", -# "pin_memory": True -# }, "overlap_comm": True, "contiguous_gradients": True, "reduce_bucket_size": model_hidden_size * model_hidden_size, @@ -143,13 +157,15 @@ def get_checkpoint_files(pretrained_model_name_or_path): dschf = HfDeepSpeedConfig(ds_config) -torch.cuda.empty_cache() -gc.collect() -deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) -deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) model = model.eval() @@ -158,14 +174,16 @@ def get_checkpoint_files(pretrained_model_name_or_path): if rank == 0: print(ds_config) - ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] ds_engine.module.eval() model = ds_engine.module -# a must to remove ZeRO hooks! +### Deepspeed-ZeRO Unloading + +# a must to remove ZeRO-installed hooks! 
ds_engine.destroy() +# free GPU storage used by ZeRO from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus def ds_clear_params(ds_engine): for p in ds_engine.parameters(): @@ -173,16 +191,13 @@ def ds_clear_params(ds_engine): p.ds_tensor = torch.empty(0, dtype=p.dtype, device=p.device) p.ds_status = ZeroParamStatus.NOT_AVAILABLE -# this frees the memory used by zero ds_clear_params(ds_engine) - -#ds_engine.module = None del ds_engine -torch.cuda.empty_cache() -gc.collect() -deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) - +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) checkpoints_json = "checkpoints.json" def write_checkponts_json(): @@ -206,14 +221,10 @@ def write_checkponts_json(): write_checkponts_json() dist.barrier() -#print("before deepspeed.init_inference") -torch.cuda.empty_cache() -gc.collect() -deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) - -# use one of these args to `init_inference` -# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work -# 2. replace_with_kernel_inject is the faster one (fast fused kernels) +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) if kernel_inject: kwargs = dict(replace_with_kernel_inject=True) @@ -225,14 +236,13 @@ def write_checkponts_json(): mp_size=world_size, dtype=torch.half, checkpoint=checkpoints_json, - **kwargs + **kwargs, ) -# injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}, -# #replace_with_kernel_inject=True -torch.cuda.empty_cache() -gc.collect() -deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) if rank == 0: print(f"*** Starting to generate {num_tokens}") @@ -242,6 +252,9 @@ def write_checkponts_json(): if args.benchmark: t_ready = time.time() + +### Generate + text_in = 'DeepSpeed is a machine learning framework' tokens = tokenizer(text_in, return_tensors="pt") @@ -267,9 +280,12 @@ def write_checkponts_json(): if args.benchmark: t_finish = time.time() -torch.cuda.empty_cache() -gc.collect() -deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark # benchmark it! 
if args.benchmark: From dca2c8f778e91463036cbd3643ad40625f2ecf27 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 12 Jul 2022 22:40:59 -0700 Subject: [PATCH 20/55] improve --- scripts/inference/bloom-ds-inference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 61096e4cd..03a76f9c9 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -244,8 +244,6 @@ def write_checkponts_json(): gc.collect() deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) -if rank == 0: - print(f"*** Starting to generate {num_tokens}") model = model.module @@ -255,6 +253,9 @@ def write_checkponts_json(): ### Generate +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens") + text_in = 'DeepSpeed is a machine learning framework' tokens = tokenizer(text_in, return_tensors="pt") From 85580c0b721c5cb418cac676e6a4f40b6eae8978 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 20:00:32 +0200 Subject: [PATCH 21/55] fix benchmark, add warmup --- scripts/inference/bloom-ds-inference.py | 53 ++++++++++--------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 03a76f9c9..79207f605 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -256,24 +256,24 @@ def write_checkponts_json(): if rank == 0: print(f"*** Starting to generate {num_tokens} tokens") -text_in = 'DeepSpeed is a machine learning framework' +def generate(): + text_in = 'DeepSpeed is a machine learning framework' + tokens = tokenizer(text_in, return_tensors="pt") + for t in tokens: + if torch.is_tensor(tokens[t]): + tokens[t] = tokens[t].to(torch.cuda.current_device()) + gen_tokens = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=False) + text_out = tokenizer.batch_decode(gen_tokens)[0] + return text_in, text_out + +# warmup +text_in, text_out = generate() -tokens = tokenizer(text_in, return_tensors="pt") - -for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to(torch.cuda.current_device()) - -with torch.no_grad(): - gen_tokens = model.generate( - **tokens, - min_length=num_tokens, - max_length=num_tokens, - do_sample=False, - ) - - -text_out = tokenizer.batch_decode(gen_tokens)[0] +if args.benchmark: + # make sure one generate is run earlier as a warmup + t_generate_start = time.time() + text_in, text_out = generate() + t_generate_span = time.time() - t_generate_start if rank == 0: print(f"in={text_in}\nout={text_out}") @@ -295,25 +295,14 @@ def write_checkponts_json(): # warm up for i in range(1): - gen_tokens = model.generate( - **tokens, - min_length=num_tokens, - max_length=num_tokens, - do_sample=True, - ) - + generate() torch.cuda.synchronize() # benchmark t0 = time.time() cycles = 5 for i in range(cycles): - gen_tokens = model.generate( - **tokens, - min_length=num_tokens, - max_length=num_tokens, - do_sample=True, - ) + generate() torch.cuda.synchronize() if rank == 0: througput = (time.time() - t0)/(cycles*num_tokens) @@ -321,6 +310,6 @@ def write_checkponts_json(): *** Performance stats: Throughput per token: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Generate {num_tokens} tokens: {t_finish - t_ready:.3f} secs -Start to finish: {t_finish - t_start:.3f} secs +Tokenize and generate 
{num_tokens} tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) From 5ea3dee9fe86ad4ca3ec7c4e65263f8822725b29 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 11:05:20 -0700 Subject: [PATCH 22/55] update --- scripts/inference/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 22610595b..b8b80fc83 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -17,10 +17,10 @@ Adding `--benchmark` to activate the benchmarks deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --benchmark *** Performance stats: -Start to ready to generate: 698.697 secs -Generate 100 tokens: 23.008 secs -Start to finish 721.705 secs -Througput per token: 0.0412 secs +Throughput per token: 40.73 msecs +Start to ready to generate: 673.429 secs +Tokenize and generate 100 tokens: 4.089 secs +Start to finish: 677.518 secs ``` While processing memory per process: From 737c68168e03de826bc40c77e95b0afd2501be4f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 11:56:13 -0700 Subject: [PATCH 23/55] fix; thanks Michael Wyatt --- scripts/inference/bloom-ds-inference.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 79207f605..aba06a3fd 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -67,6 +67,8 @@ def get_checkpoint_files(pretrained_model_name_or_path): if is_offline_mode(): print("Offline mode: forcing local_files_only=True") local_files_only = True + else: + local_files_only = False filename = WEIGHTS_NAME archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) From 6be0cca48c3057d3ea0d4fc26f4e25f43be9df1d Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 11:58:28 -0700 Subject: [PATCH 24/55] clarify --- scripts/inference/bloom-ds-inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index aba06a3fd..e1dcc18ec 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -268,7 +268,8 @@ def generate(): text_out = tokenizer.batch_decode(gen_tokens)[0] return text_in, text_out -# warmup +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs text_in, text_out = generate() if args.benchmark: From fc9b458a910432d3a8aa85ab7a23ac45574d8936 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 14 Jul 2022 03:34:08 +0500 Subject: [PATCH 25/55] add bloom batch-inference script --- scripts/inference/bloom-batch-inference.py | 177 +++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 scripts/inference/bloom-batch-inference.py diff --git a/scripts/inference/bloom-batch-inference.py b/scripts/inference/bloom-batch-inference.py new file mode 100644 index 000000000..c15a53281 --- /dev/null +++ b/scripts/inference/bloom-batch-inference.py @@ -0,0 +1,177 @@ +import torch +import deepspeed +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from argparse import ArgumentParser +import os +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +from transformers.models.bloom.modeling_bloom import BloomPreTrainedModel +from transformers.deepspeed import HfDeepSpeedConfig +from transformers import BloomTokenizerFast + + +def get_checkpoint_files(pretrained_model_name_or_path): + # XXX: I just hacked this one together to automatically handle the fetching of the model file or + # shards into cache and returning the cached entries - note that I removed most arguments + + from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode + from transformers.utils.hub import EntryNotFoundError + from transformers.modeling_utils import get_checkpoint_shard_files + + cache_dir = None + is_sharded = False + + # XXX: preparation for revision branches if needed + revision = None + #revision = "sharded" + + # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with + # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" + if is_offline_mode(): + print("Offline mode: forcing local_files_only=True") + local_files_only = True + else: + local_files_only = False + + filename = WEIGHTS_NAME + archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) + + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) + return [resolved_archive_file] + + except (EntryNotFoundError, FileNotFoundError): + if filename == WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=WEIGHTS_INDEX_NAME, + revision=revision, + ) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + local_files_only=local_files_only, + ) + is_sharded = True + + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + revision=revision + ) + + return resolved_archive_file + +parser = ArgumentParser() +parser.add_argument("--name", required=True, type=str) +parser.add_argument("--local_rank", required=False, type=int) +args = parser.parse_args() +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +print( + "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************" + .format(local_rank, + world_size)) +tokenizer = BloomTokenizerFast.from_pretrained(args.name) +config = AutoConfig.from_pretrained(args.name) +model_hidden_size = config.hidden_size +train_batch_size = 1 * world_size +ds_config = { + "fp16": { + "enabled": True + }, + "bf16": { + "enabled": False + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + "pin_memory": True + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 0 + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} +dschf = HfDeepSpeedConfig(ds_config) +model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16).eval() + +from deepspeed.runtime.utils import see_memory_usage +see_memory_usage("after model load ", force=True) +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +see_memory_usage("after zero-init ", force=True) +ds_engine.module.eval() # inference +model = ds_engine.module + +ds_engine.destroy() +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +def ds_clear_params(ds_engine): + for name, p in ds_engine.named_parameters(): + if hasattr(p, "ds_tensor"): + p.ds_tensor = torch.empty(0, dtype=p.dtype, device=p.device) + p.ds_status = ZeroParamStatus.NOT_AVAILABLE +# this frees the memory used by zero +ds_clear_params(ds_engine) +del ds_engine + +checkpoints_json = "checkpoints.json" +def write_checkponts_json(): + model_name = args.name + with io.open(checkpoints_json, 'w', encoding='utf-8') as f: + checkpoint_files = get_checkpoint_files(model_name) + + print("Checkpoint files:", checkpoint_files) + + data = { + "type": "BLOOM-176B", + "checkpoints": checkpoint_files, + "version": 1.0 + } + json.dump(data, f) + +model = deepspeed.init_inference(model, + mp_size=world_size, + dtype=torch.half, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True, + ) +model = model.module +input_sentence = ["DeepSpeed is", + "Reza is working on", + "Jeff has a", + "Stas got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way"] + +print("inference-engine created \n") + +tokenizer = BloomTokenizerFast.from_pretrained(args.name, padding_side="left") +tokens = tokenizer.batch_encode_plus(input_sentence, return_tensors="pt", padding=True) + +for t in tokens: + if torch.is_tensor(tokens[t]): + tokens[t] = tokens[t].to(torch.cuda.current_device()) + +greedy_output = model.generate( + **tokens, max_length=100, do_sample=True +) + +for i in range(len(greedy_output)): + out = tokenizer.decode(greedy_output[i], skip_special_tokens=True) + if torch.distributed.get_rank() == 0: + print(out) + 
print \ No newline at end of file From 7b0edef2848082c5361c38883501005e72c266ab Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 14 Jul 2022 03:38:03 +0500 Subject: [PATCH 26/55] removed the names :-) --- scripts/inference/bloom-batch-inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/inference/bloom-batch-inference.py b/scripts/inference/bloom-batch-inference.py index c15a53281..9b8d54e67 100644 --- a/scripts/inference/bloom-batch-inference.py +++ b/scripts/inference/bloom-batch-inference.py @@ -149,9 +149,9 @@ def write_checkponts_json(): ) model = model.module input_sentence = ["DeepSpeed is", - "Reza is working on", - "Jeff has a", - "Stas got all", + "He is working on", + "He has a", + "He got all", "Everyone is happy and I can", "The new movie that got Oscar this year", "In the far far distance from our galaxy,", From 2120dd2ae4a076931f35ddb3c9de7f5f21bb0786 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 16:28:09 -0700 Subject: [PATCH 27/55] fold the bs functionality from the other script --- scripts/inference/bloom-batch-inference.py | 177 --------------------- scripts/inference/bloom-ds-inference.py | 52 ++++-- 2 files changed, 35 insertions(+), 194 deletions(-) delete mode 100644 scripts/inference/bloom-batch-inference.py diff --git a/scripts/inference/bloom-batch-inference.py b/scripts/inference/bloom-batch-inference.py deleted file mode 100644 index 9b8d54e67..000000000 --- a/scripts/inference/bloom-batch-inference.py +++ /dev/null @@ -1,177 +0,0 @@ -import torch -import deepspeed -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from argparse import ArgumentParser -import os -from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock -from transformers.models.bloom.modeling_bloom import BloomPreTrainedModel -from transformers.deepspeed import HfDeepSpeedConfig -from transformers import BloomTokenizerFast - - -def get_checkpoint_files(pretrained_model_name_or_path): - # XXX: I just hacked this one together to automatically handle the fetching of the model file or - # shards into cache and returning the cached entries - note that I removed most arguments - - from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode - from transformers.utils.hub import EntryNotFoundError - from transformers.modeling_utils import get_checkpoint_shard_files - - cache_dir = None - is_sharded = False - - # XXX: preparation for revision branches if needed - revision = None - #revision = "sharded" - - # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with - # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" - if is_offline_mode(): - print("Offline mode: forcing local_files_only=True") - local_files_only = True - else: - local_files_only = False - - filename = WEIGHTS_NAME - archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) - - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) - return [resolved_archive_file] - - except (EntryNotFoundError, FileNotFoundError): - if filename == WEIGHTS_NAME: - # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
- archive_file = hf_bucket_url( - pretrained_model_name_or_path, - filename=WEIGHTS_INDEX_NAME, - revision=revision, - ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - local_files_only=local_files_only, - ) - is_sharded = True - - if is_sharded: - # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. - resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path, - resolved_archive_file, - cache_dir=cache_dir, - revision=revision - ) - - return resolved_archive_file - -parser = ArgumentParser() -parser.add_argument("--name", required=True, type=str) -parser.add_argument("--local_rank", required=False, type=int) -args = parser.parse_args() -local_rank = int(os.getenv('LOCAL_RANK', '0')) -world_size = int(os.getenv('WORLD_SIZE', '1')) - -print( - "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************" - .format(local_rank, - world_size)) -tokenizer = BloomTokenizerFast.from_pretrained(args.name) -config = AutoConfig.from_pretrained(args.name) -model_hidden_size = config.hidden_size -train_batch_size = 1 * world_size -ds_config = { - "fp16": { - "enabled": True - }, - "bf16": { - "enabled": False - }, - "zero_optimization": { - "stage": 3, - "offload_param": { - "device": "cpu", - "pin_memory": True - }, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": model_hidden_size * model_hidden_size, - "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 0 - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False -} -dschf = HfDeepSpeedConfig(ds_config) -model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16).eval() - -from deepspeed.runtime.utils import see_memory_usage -see_memory_usage("after model load ", force=True) -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -see_memory_usage("after zero-init ", force=True) -ds_engine.module.eval() # inference -model = ds_engine.module - -ds_engine.destroy() -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus -def ds_clear_params(ds_engine): - for name, p in ds_engine.named_parameters(): - if hasattr(p, "ds_tensor"): - p.ds_tensor = torch.empty(0, dtype=p.dtype, device=p.device) - p.ds_status = ZeroParamStatus.NOT_AVAILABLE -# this frees the memory used by zero -ds_clear_params(ds_engine) -del ds_engine - -checkpoints_json = "checkpoints.json" -def write_checkponts_json(): - model_name = args.name - with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - checkpoint_files = get_checkpoint_files(model_name) - - print("Checkpoint files:", checkpoint_files) - - data = { - "type": "BLOOM-176B", - "checkpoints": checkpoint_files, - "version": 1.0 - } - json.dump(data, f) - -model = deepspeed.init_inference(model, - mp_size=world_size, - dtype=torch.half, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True, - ) -model = model.module -input_sentence = ["DeepSpeed is", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way"] - -print("inference-engine created \n") - -tokenizer = BloomTokenizerFast.from_pretrained(args.name, padding_side="left") -tokens = tokenizer.batch_encode_plus(input_sentence, 
return_tensors="pt", padding=True) - -for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to(torch.cuda.current_device()) - -greedy_output = model.generate( - **tokens, max_length=100, do_sample=True -) - -for i in range(len(greedy_output)): - out = tokenizer.decode(greedy_output[i], skip_special_tokens=True) - if torch.distributed.get_rank() == 0: - print(out) - print \ No newline at end of file diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index e1dcc18ec..d0e418bcb 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -38,6 +38,7 @@ parser.add_argument("--name", required=True, type=str, help="model_name") parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") args = parser.parse_args() @@ -256,34 +257,51 @@ def write_checkponts_json(): ### Generate if rank == 0: - print(f"*** Starting to generate {num_tokens} tokens") - + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + raise ValueError(f"--batch_size should be <= {len(input_sentences)}") + +inputs = input_sentences[:args.batch_size] def generate(): - text_in = 'DeepSpeed is a machine learning framework' - tokens = tokenizer(text_in, return_tensors="pt") + """ returns a list of pairs of inputs and outputs """ + + tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in tokens: if torch.is_tensor(tokens[t]): tokens[t] = tokens[t].to(torch.cuda.current_device()) - gen_tokens = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=False) - text_out = tokenizer.batch_decode(gen_tokens)[0] - return text_in, text_out + + greedy_output = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=True) + + outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + + return zip(inputs, outputs) + # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs -text_in, text_out = generate() +pairs = generate() +for i,o in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") if args.benchmark: # make sure one generate is run earlier as a warmup t_generate_start = time.time() - text_in, text_out = generate() + _ = generate() t_generate_span = time.time() - t_generate_start -if rank == 0: - print(f"in={text_in}\nout={text_out}") - -if args.benchmark: - t_finish = time.time() - if args.benchmark: torch.cuda.empty_cache() gc.collect() @@ -298,14 +316,14 @@ def generate(): # warm up for i in range(1): - generate() + _ = generate() torch.cuda.synchronize() # benchmark t0 = time.time() cycles = 5 for i in range(cycles): - generate() + _ = generate() torch.cuda.synchronize() if rank == 0: througput = (time.time() - t0)/(cycles*num_tokens) From 78bcbb7ede02232e24d44b66b521770ffa3ba7f3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 16:30:29 -0700 Subject: [PATCH 28/55] fix --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index d0e418bcb..1fc857435 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -329,7 +329,7 @@ def generate(): througput = (time.time() - t0)/(cycles*num_tokens) print(f""" *** Performance stats: -Throughput per token: {througput*1000:.2f} msecs +Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs Tokenize and generate {num_tokens} tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs From e7468cd2a9784cf1ff2848144c4198af4d79e979 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 16:40:21 -0700 Subject: [PATCH 29/55] restore do_sample --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 1fc857435..4a9cf8fd4 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -283,7 +283,7 @@ def generate(): if torch.is_tensor(tokens[t]): tokens[t] = tokens[t].to(torch.cuda.current_device()) - greedy_output = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=True) + greedy_output = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=False) outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) From 68f5ca6a1d997b3a320c79dc2e3b68d1d5d579b3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 16:56:55 -0700 Subject: [PATCH 30/55] dump generate args --- scripts/inference/bloom-ds-inference.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 4a9cf8fd4..5eadc25a5 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -273,6 +273,9 @@ def write_checkponts_json(): if args.batch_size > len(input_sentences): raise ValueError(f"--batch_size should be <= {len(input_sentences)}") +generate_kwargs = dict(in_length=num_tokens, max_length=num_tokens, do_sample=False) +if rank == 0: + print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] def generate(): """ returns a list of pairs of inputs and 
outputs """ @@ -283,7 +286,7 @@ def generate(): if torch.is_tensor(tokens[t]): tokens[t] = tokens[t].to(torch.cuda.current_device()) - greedy_output = model.generate(**tokens, min_length=num_tokens, max_length=num_tokens, do_sample=False) + greedy_output = model.generate(**tokens, **generate_kwargs) outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) From 1eca7c50656d5c71a70f99ef5c327ee8957b98fe Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 21:09:26 -0700 Subject: [PATCH 31/55] fix --- scripts/inference/bloom-ds-inference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 5eadc25a5..1af4113bb 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -296,8 +296,9 @@ def generate(): # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs pairs = generate() -for i,o in pairs: - print(f"{'-'*60}\nin={i}\nout={o}\n") +if rank == 0: + for i,o in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") if args.benchmark: # make sure one generate is run earlier as a warmup From 8815fc3df4e045e3c568b686361649fca2da0884 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 13 Jul 2022 21:19:00 -0700 Subject: [PATCH 32/55] fix --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 1af4113bb..28eb3e56e 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -273,7 +273,7 @@ def write_checkponts_json(): if args.batch_size > len(input_sentences): raise ValueError(f"--batch_size should be <= {len(input_sentences)}") -generate_kwargs = dict(in_length=num_tokens, max_length=num_tokens, do_sample=False) +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] From 034cc6fed0d3343750ace12e0f8bb79d20dfed67 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 08:06:55 +0200 Subject: [PATCH 33/55] support any batchsize --- scripts/inference/bloom-ds-inference.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 28eb3e56e..3ef4843ea 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -22,6 +22,7 @@ from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock import deepspeed import io +import math import sys import json import os @@ -271,9 +272,11 @@ def write_checkponts_json(): ] if args.batch_size > len(input_sentences): - raise ValueError(f"--batch_size should be <= {len(input_sentences)}") + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] From 
155c3c3218313291064aee9d7648cc70c65bc70c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 08:32:33 +0200 Subject: [PATCH 34/55] div by bs --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 3ef4843ea..5a6b6c665 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -333,7 +333,7 @@ def generate(): _ = generate() torch.cuda.synchronize() if rank == 0: - througput = (time.time() - t0)/(cycles*num_tokens) + througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs From 73a8b7b72978153f44540338aa56666f4e8bcedd Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 08:34:43 +0200 Subject: [PATCH 35/55] mul by bs --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 5a6b6c665..4785565ae 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -338,6 +338,6 @@ def generate(): *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {num_tokens} tokens: {t_generate_span:.3f} secs +Tokenize and generate {num_tokens*args.batch_size} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) From 09d74088d602e95dcb5f81a048b03f5a0a8a3479 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 13:49:57 -0700 Subject: [PATCH 36/55] add cpu_offload; sync scripts --- scripts/inference/bloom-ds-inference.py | 5 +- scripts/inference/bloom-ds-zero-inference.py | 212 ++++++++++++++----- 2 files changed, 159 insertions(+), 58 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 4785565ae..b7e212838 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -41,6 +41,7 @@ parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") parser.add_argument("--batch_size", default=1, type=int, help="batch size") parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) @@ -158,8 +159,10 @@ def get_checkpoint_files(pretrained_model_name_or_path): "wall_clock_breakdown": False } +if args.cpu_offload: + ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) -dschf = HfDeepSpeedConfig(ds_config) +dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus if args.benchmark: torch.cuda.empty_cache() diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 047eb0688..6d1610084 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -1,45 +1,69 @@ - # usage: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom # -# via deepspeed/zero-3 inference -# deepspeed --num_gpus 8 
bloom-ds-zero-inference.py --name bigscience/bloom +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark # -import torch -import deepspeed -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, pipeline +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. +# + + +import glob from argparse import ArgumentParser -import os -from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.deepspeed import HfDeepSpeedConfig +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +import deepspeed +import io +import math +import sys +import json +import os +import gc +import torch import torch.distributed as dist +import time + +t_start = time.time() + +num_tokens = 100 parser = ArgumentParser() -parser.add_argument("--name", required=True, type=str) -parser.add_argument("--local_rank", required=False, type=int) -#parser.add_argument("--deepspeed", action="store_true") +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) -print( - "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************" - .format(local_rank, - world_size)) - -config = AutoConfig.from_pretrained(args.name) -model_hidden_size = config.hidden_size +### Model loading and instantiating on GPU (via ZeRO) -train_batch_size = 1 * world_size model_name = args.name + +if local_rank == 0: + print(f"*** Loading the model {model_name}") + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 -# Note: you need to edit nvme_path to an actual path on your filesystem where the model will be offloaded to +model_hidden_size = config.hidden_size +train_batch_size = 1 * world_size + ds_config = { "fp16": { "enabled": dtype == torch.float16, @@ -49,19 +73,11 @@ }, "zero_optimization": { "stage": 3, - "offload_param": { - "device": "nvme", - "nvme_path": "/mnt/nvme0/offload/", - "pin_memory": True, - "buffer_count": 4, - "buffer_size": 4e9, # for bloom, otherwise the default 1e8 should be enough - "fast_init": False - }, "overlap_comm": True, "contiguous_gradients": True, "reduce_bucket_size": model_hidden_size * model_hidden_size, "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 10 * model_hidden_size + "stage3_param_persistence_threshold": 0 }, "steps_per_print": 2000, "train_batch_size": train_batch_size, @@ -69,37 +85,119 @@ "wall_clock_breakdown": 
False } -deepspeed.runtime.utils.see_memory_usage('pre-init', force=True) -#if args.deepspeed: -dschf = HfDeepSpeedConfig(ds_config) +if args.cpu_offload: + ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) -model = AutoModelForCausalLM.from_pretrained(model_name) +dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus -#generator = pipeline('text-generation', model=args.name, device=local_rank, framework="pt") -deepspeed.runtime.utils.see_memory_usage('post-init', force=True) +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -ds_engine.module.eval() -# generator.model = ds_engine.module -deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) -# if args.deepspeed: -# ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -# ds_engine.module.eval() -# # generator.model = ds_engine.module -# deepspeed.runtime.utils.see_memory_usage('post-ds-init', force=True) -# else: -# dist.init_process_group("nccl") -# model = model.to(device=local_rank) +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) -#response = generator('DeepSpeed is', min_length=50, max_length=50, do_sample=False) +model = model.eval() -text_in = 'DeepSpeed is' +rank = dist.get_rank() -tokenizer = AutoTokenizer.from_pretrained(model_name) -inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) -with torch.no_grad(): - #model = ds_engine.module if args.deepspeed else model - outputs = model.generate(inputs, synced_gpus=True, min_length=50, max_length=50, do_sample=False) -text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) -print(f"in={text_in}\nout={text_out}") +if rank == 0: + print(ds_config) + +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() +model = ds_engine.module + +if args.benchmark: + t_ready = time.time() + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of pairs of inputs and outputs """ + + tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + + for t in tokens: + if torch.is_tensor(tokens[t]): + tokens[t] = tokens[t].to(torch.cuda.current_device()) + + greedy_output = model.generate(**tokens, **generate_kwargs) + + outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + + return zip(inputs, outputs) + + +# warmup is a must if measuring speed as it's when all the 
optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +pairs = generate() +if rank == 0: + for i,o in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") + +if args.benchmark: + # make sure one generate is run earlier as a warmup + t_generate_start = time.time() + _ = generate() + t_generate_span = time.time() - t_generate_start + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark + +# benchmark it! +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + for i in range(cycles): + _ = generate() + torch.cuda.synchronize() + if rank == 0: + througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {num_tokens*args.batch_size} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") From 695265d481d7b066e8c1626d734f7685aeeae2f1 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 14:10:50 -0700 Subject: [PATCH 37/55] wip --- scripts/inference/README.md | 52 +++++++++++++++++--- scripts/inference/bloom-ds-inference.py | 4 +- scripts/inference/bloom-ds-zero-inference.py | 6 ++- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index b8b80fc83..10cfdf727 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -6,21 +6,19 @@ Tensor-Parallelism and efficient fused CUDA kernels: https://www.deepspeed.ai/tutorials/inference-tutorial/ ``` -deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom +deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --batch_size 1 ``` Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) Adding `--benchmark` to activate the benchmarks + +BS=1 ``` -deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --benchmark +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark +[...] -*** Performance stats: -Throughput per token: 40.73 msecs -Start to ready to generate: 673.429 secs -Tokenize and generate 100 tokens: 4.089 secs -Start to finish: 677.518 secs ``` While processing memory per process: @@ -29,10 +27,48 @@ While processing memory per process: - CPU: ~10GB +BS=8 +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark +[...] + +``` + +BS=64 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark + + +``` + + ## Deepspeed ZeRO-Inference https://www.deepspeed.ai/tutorials/zero/ ``` -deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark +[...] + + + + +``` + +``` +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark +[...] 
+ + + +``` + + +``` +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 64 --benchmark +[...] + + + ``` diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index b7e212838..2a5bea5cb 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -278,8 +278,8 @@ def write_checkponts_json(): # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 6d1610084..d36c177bc 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -135,8 +135,8 @@ # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] @@ -169,6 +169,8 @@ def generate(): _ = generate() t_generate_span = time.time() - t_generate_start +# XXX: this is currently doing 8 streams on 8 gpus, so we can feed it different inputs on each! + if args.benchmark: torch.cuda.empty_cache() gc.collect() From 1a7e891b0babdb7864f794281eb81b8776e3a98f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 18:18:23 -0700 Subject: [PATCH 38/55] improvements --- scripts/inference/README.md | 56 ++++++++++++++++---- scripts/inference/bloom-ds-inference.py | 13 +++-- scripts/inference/bloom-ds-zero-inference.py | 15 +++--- 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 10cfdf727..5fe02f906 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -1,10 +1,22 @@ # Inference scripts for BLOOM + ## Deepspeed-Inference Tensor-Parallelism and efficient fused CUDA kernels: https://www.deepspeed.ai/tutorials/inference-tutorial/ +### Setup + +``` +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +git checkout ds-inference/bloom-support +pip install . +``` + +### Run + ``` deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --batch_size 1 ``` @@ -16,7 +28,7 @@ Adding `--benchmark` to activate the benchmarks BS=1 ``` -deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt [...] 
``` @@ -29,15 +41,19 @@ While processing memory per process: BS=8 ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-inference_bs=8.txt [...] - +*** Performance stats: +Throughput per token including tokenize: 5.23 msecs +Start to ready to generate: 683.397 secs +Tokenize and generate 800 (bs=8) tokens: 4.241 secs +Start to finish: 687.638 secs ``` BS=64 ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark 2>&1 | tee bloom-ds-inference_bs=64.txt ``` @@ -47,26 +63,48 @@ $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscien https://www.deepspeed.ai/tutorials/zero/ +### Setup + +``` +pip install deepspeed ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark -[...] +### Run +Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. -``` ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt [...] +*** Performance stats: +Throughput per token including tokenize: 2258.26 msecs +Start to ready to generate: 463.870 secs +Tokenize and generate 100 (bs=1) tokens: 226.826 secs +Start to finish: 690.695 secs +``` +divided by 8, it's about 282 msec / token ``` +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt +[...] + +Throughput per token including tokenize: 275.91 msecs +Start to ready to generate: 468.254 secs +Tokenize and generate 800 (bs=8) tokens: 221.315 secs +Start to finish: 689.569 secs + +``` + +divided by 8, it's about 4.6 msec / token + ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 64 --benchmark +$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 64 --benchmark " 2>&1 | tee bloom-ds-zero-inference_bs=64.txt [...] diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 2a5bea5cb..770d8c35b 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -206,6 +206,10 @@ def ds_clear_params(ds_engine): gc.collect() deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) + + +### Deepspeed-Inference Loading + checkpoints_json = "checkpoints.json" def write_checkponts_json(): @@ -301,16 +305,15 @@ def generate(): # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() pairs = generate() +t_generate_span = time.time() - t_generate_start if rank == 0: for i,o in pairs: print(f"{'-'*60}\nin={i}\nout={o}\n") -if args.benchmark: - # make sure one generate is run earlier as a warmup - t_generate_start = time.time() - _ = generate() - t_generate_span = time.time() - t_generate_start if args.benchmark: torch.cuda.empty_cache() diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index d36c177bc..169699f7f 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -155,21 +155,19 @@ def generate(): return zip(inputs, outputs) +# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() pairs = generate() +t_generate_span = time.time() - t_generate_start if rank == 0: for i,o in pairs: print(f"{'-'*60}\nin={i}\nout={o}\n") -if args.benchmark: - # make sure one generate is run earlier as a warmup - t_generate_start = time.time() - _ = generate() - t_generate_span = time.time() - t_generate_start - -# XXX: this is currently doing 8 streams on 8 gpus, so we can feed it different inputs on each! if args.benchmark: torch.cuda.empty_cache() @@ -195,7 +193,8 @@ def generate(): _ = generate() torch.cuda.synchronize() if rank == 0: - througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size) + # note that dividing by world_size as well as we can have world_size streams + througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size*world_size) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs From aba4055a5abc3780fa3e0764867b7ed8e2fac0d0 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 21:34:02 -0700 Subject: [PATCH 39/55] fixes --- scripts/inference/README.md | 36 +++++++++----------- scripts/inference/bloom-ds-zero-inference.py | 5 +-- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 5fe02f906..23cdcd00f 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -18,7 +18,7 @@ pip install . ### Run ``` -deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --batch_size 1 +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom ``` Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) @@ -76,37 +76,33 @@ Note that the script currently runs the same inputs on all GPUs, but you can run ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt [...] 
- *** Performance stats: -Throughput per token including tokenize: 2258.26 msecs -Start to ready to generate: 463.870 secs -Tokenize and generate 100 (bs=1) tokens: 226.826 secs -Start to finish: 690.695 secs +Throughput per token including tokenize: 282.93 msecs +Start to ready to generate: 501.871 secs +Tokenize and generate 800 (bs=1) tokens: 226.188 secs +Start to finish: 728.060 secs ``` -divided by 8, it's about 282 msec / token + ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt [...] -Throughput per token including tokenize: 275.91 msecs -Start to ready to generate: 468.254 secs -Tokenize and generate 800 (bs=8) tokens: 221.315 secs -Start to finish: 689.569 secs - +*** Performance stats: +Throughput per token including tokenize: 34.57 msecs +Start to ready to generate: 482.132 secs +Tokenize and generate 6400 (bs=8) tokens: 221.236 secs +Start to finish: 703.368 secs ``` -divided by 8, it's about 4.6 msec / token - - +BS=16 and higher OOMs ``` -$ deepspeed --num_gpus 8 bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 64 --benchmark " 2>&1 | tee bloom-ds-zero-inference_bs=64.txt +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt [...] - - +OOM ``` diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 169699f7f..d5c2ee83f 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -194,11 +194,12 @@ def generate(): torch.cuda.synchronize() if rank == 0: # note that dividing by world_size as well as we can have world_size streams - througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size*world_size) + tokens_in_cycle_total = num_tokens*args.batch_size*world_size + througput = (time.time() - t0)/(cycles*tokens_in_cycle_total) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {num_tokens*args.batch_size} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {tokens_in_cycle_total} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) From 5e92d552a4a592e3a3f2983432d3559af11b0596 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Jul 2022 21:36:37 -0700 Subject: [PATCH 40/55] fixes --- scripts/inference/README.md | 3 +++ scripts/inference/bloom-ds-inference.py | 5 +++-- scripts/inference/bloom-ds-zero-inference.py | 6 +++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 23cdcd00f..9c0492031 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -75,6 +75,8 @@ pip install deepspeed Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. 
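(Editor's aside, not part of the patch series: the note above about running a different stream on each GPU is a claim in the README only; a minimal sketch of what it could look like, assuming a prompt pool `all_prompts` with at least `world_size * args.batch_size` entries and keeping the per-rank batch size and `min_length`/`max_length` identical so that every rank issues the same number of ZeRO-3 forward passes, might be:

```
# hypothetical per-rank input sharding for bloom-ds-zero-inference.py
start = rank * args.batch_size
inputs = all_prompts[start : start + args.batch_size]  # each rank generates on its own slice
```

The ds-zero throughput reported by the script is already divided by `world_size` on exactly this assumption.)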
+BS=1 + ``` $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt [...] @@ -86,6 +88,7 @@ Start to finish: 728.060 secs ``` +BS=8 ``` $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 770d8c35b..5fcf1f167 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -339,11 +339,12 @@ def generate(): _ = generate() torch.cuda.synchronize() if rank == 0: - througput = (time.time() - t0)/(cycles*num_tokens*args.batch_size) + tokens_in_cycle = num_tokens * args.batch_size + througput = (time.time() - t0)/(cycles * tokens_in_cycle) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {num_tokens*args.batch_size} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index d5c2ee83f..8b0ea7d2d 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -194,12 +194,12 @@ def generate(): torch.cuda.synchronize() if rank == 0: # note that dividing by world_size as well as we can have world_size streams - tokens_in_cycle_total = num_tokens*args.batch_size*world_size - througput = (time.time() - t0)/(cycles*tokens_in_cycle_total) + tokens_in_cycle = num_tokens * args.batch_size * world_size + througput = (time.time() - t0)/(cycles * tokens_in_cycle) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {tokens_in_cycle_total} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) From 399211224aa5c759e81385ca28f5d7a3d8f246f3 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 15 Jul 2022 11:43:33 -0700 Subject: [PATCH 41/55] add accelerate script --- scripts/inference/README.md | 36 ++++- .../inference/bloom-accelerate-inference.py | 146 ++++++++++++++++++ scripts/inference/bloom-ds-inference.py | 10 +- scripts/inference/bloom-ds-zero-inference.py | 10 +- 4 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 scripts/inference/bloom-accelerate-inference.py diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 9c0492031..6e32326b2 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -1,6 +1,8 @@ # Inference scripts for BLOOM + + ## Deepspeed-Inference Tensor-Parallelism and efficient fused CUDA kernels: @@ -28,7 +30,7 @@ Adding `--benchmark` to activate the benchmarks BS=1 ``` -deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 
--benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt [...] ``` @@ -56,6 +58,8 @@ BS=64 $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark 2>&1 | tee bloom-ds-inference_bs=64.txt + + ``` @@ -109,3 +113,33 @@ $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name big OOM ``` + + + +## HF Accelerate + +https://github.com/huggingface/accelerate + +### Setup + +``` +pip install transformers +``` + + +BS=1 + + +### Run + + + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +[...] +*** Performance stats: +Throughput per token including tokenize: 282.93 msecs +Start to ready to generate: 501.871 secs +Tokenize and generate 800 (bs=1) tokens: 226.188 secs +Start to finish: 728.060 secs +``` diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py new file mode 100644 index 000000000..a5ca15e40 --- /dev/null +++ b/scripts/inference/bloom-accelerate-inference.py @@ -0,0 +1,146 @@ +import argparse +import time +import os +import gc +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") + parser.add_argument("--name", type=str, help="Name path", required=True) + parser.add_argument("--batch_size", default=1, type=int, help="batch size") + parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") + parser.add_argument("--greedy", action="store_true") + parser.add_argument("--top-k", type=int, default=0) + parser.add_argument("--top-p", type=float, default=0.) 
+ + return parser.parse_args() + +def get_max_memory_per_gpu_dict(): + max_memory_per_gpu = torch.cuda.get_device_properties(0).total_memory // 2**30 + return {i: f"{max_memory_per_gpu}GIB" for i in range(torch.cuda.device_count())} + +t_start = time.time() + +num_tokens = 100 + +args = get_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +rank = local_rank + +model_name = args.name +print(f"Loading model {model_name}") + + +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="auto", + max_memory=get_max_memory_per_gpu_dict(), + torch_dtype=dtype, +) + + +if args.benchmark: + t_ready = time.time() + + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) + +#top_k=None if greedy else top_k, +#top_p=None if greedy else top_p + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of pairs of inputs and outputs """ + + tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in tokens: + if torch.is_tensor(tokens[t]): + tokens[t] = tokens[t].to("cuda:0") + + greedy_output = model.generate(**tokens, **generate_kwargs) + + outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + + return zip(inputs, outputs) + +# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +pairs = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + +### Benchmark + +# benchmark it! 
+if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + for i in range(cycles): + _ = generate() + torch.cuda.synchronize() + if rank == 0: + # note that dividing by world_size as well as we can have world_size streams + tokens_in_cycle = num_tokens * args.batch_size * world_size + througput = (time.time() - t0)/(cycles * tokens_in_cycle) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 5fcf1f167..c553750db 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -15,21 +15,21 @@ # -import glob from argparse import ArgumentParser from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.deepspeed import HfDeepSpeedConfig from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock import deepspeed +import gc +import glob import io -import math -import sys import json +import math import os -import gc +import sys +import time import torch import torch.distributed as dist -import time t_start = time.time() diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 8b0ea7d2d..5a776d1b9 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -15,21 +15,21 @@ # -import glob from argparse import ArgumentParser from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from transformers.deepspeed import HfDeepSpeedConfig from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock import deepspeed +import gc +import glob import io -import math -import sys import json +import math import os -import gc +import sys +import time import torch import torch.distributed as dist -import time t_start = time.time() From 5a7057b0f62d678aa32dc27a15eb9a2487f26e9c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 15 Jul 2022 11:50:08 -0700 Subject: [PATCH 42/55] fix --- scripts/inference/bloom-accelerate-inference.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index a5ca15e40..bf42a0281 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -33,7 +33,8 @@ def get_max_memory_per_gpu_dict(): rank = local_rank model_name = args.name -print(f"Loading model {model_name}") +if rank == 0: + print(f"Loading model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -135,7 +136,7 @@ def generate(): torch.cuda.synchronize() if rank == 0: # note that dividing by world_size as well as we can have world_size streams - tokens_in_cycle = num_tokens * args.batch_size * world_size + tokens_in_cycle = num_tokens * args.batch_size througput = (time.time() - t0)/(cycles * tokens_in_cycle) print(f""" *** Performance stats: From 47585312451cd358e6b85dcb98eb105063d016ce Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 15 Jul 2022 22:18:49 -0700 Subject: [PATCH 43/55] wip 
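(Editor's note on the hunk below: it replaces the naive "total GPU memory per device" map with an estimate derived from a hardcoded parameter count. A worked example of that arithmetic, assuming 8 GPUs and a 16-bit dtype, with the 179 * 2**30 figure taken from the patch itself:

```
params  = 179 * 2**30            # parameter count as hardcoded for BLOOM (~192e9)
weights = params * 2             # 2 bytes per param in fp16/bf16, ~358 GiB of weights total
per_gpu = int(weights / 8 * 1.1) # ~49 GiB per GPU once the 10% safety margin is added
```

which comfortably fits the 80GB A100s used for the benchmarks in this README.)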
--- scripts/inference/README.md | 36 +++++++++++++++---- .../inference/bloom-accelerate-inference.py | 31 +++++++++++++--- 2 files changed, 56 insertions(+), 11 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 6e32326b2..004112b87 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -62,6 +62,15 @@ $ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscien ``` +BS=128 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 128 --benchmark 2>&1 | tee bloom-ds-inference_bs=128.txt + + + + +``` ## Deepspeed ZeRO-Inference @@ -127,19 +136,32 @@ pip install transformers ``` + +### Run + + + + BS=1 +``` +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +[...] -### Run +``` + +BS=8 +``` +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt +[...] +``` +BS=16 ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt [...] -*** Performance stats: -Throughput per token including tokenize: 282.93 msecs -Start to ready to generate: 501.871 secs -Tokenize and generate 800 (bs=1) tokens: 226.188 secs -Start to finish: 728.060 secs + + ``` diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index bf42a0281..dd2a07928 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -17,9 +17,29 @@ def get_args(): return parser.parse_args() -def get_max_memory_per_gpu_dict(): - max_memory_per_gpu = torch.cuda.get_device_properties(0).total_memory // 2**30 - return {i: f"{max_memory_per_gpu}GIB" for i in range(torch.cuda.device_count())} +def get_max_memory_per_gpu_dict(dtype): + # figure out the memory map - the minimum per gpu required to load the model + n_gpus = torch.cuda.device_count() + + # hardcode for now for bloom + params = 179 * 2**30 + # XXX: how to figure out model size w/o having a model object yet? 
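    # (editor's aside, not part of the patch: one possible answer to the XXX above, offered only
    #  as a sketch. If the checkpoint is sharded, the index file shipped with it already records
    #  the total weight size in bytes, e.g.
    #      import json
    #      with open("pytorch_model.bin.index.json") as f:
    #          param_memory_total_in_bytes = json.load(f)["metadata"]["total_size"]
    #  alternatively, the parameter count could be derived from AutoConfig fields alone.)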
+ #params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + + bytes = torch.finfo(dtype).bits / 8 + param_memory_total_in_bytes = params * bytes + # add 10% since weights sizes aren't the same and some gpu may need more memory + param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.1) + print(f"Estimating {param_memory_per_gpu_in_bytes/2**30}GB per gpu for weights") + + # check the real available memory + # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) + torch.ones(1).cuda() + max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] + if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: + raise ValueError(f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") + + return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} t_start = time.time() @@ -42,10 +62,13 @@ def get_max_memory_per_gpu_dict(): # XXX: can't automatically derive dtype via config's `from_pretrained` dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 +#print(get_max_memory_per_gpu_dict()) + + model = AutoModelForCausalLM.from_pretrained( model_name, device_map="auto", - max_memory=get_max_memory_per_gpu_dict(), + max_memory=get_max_memory_per_gpu_dict(dtype), torch_dtype=dtype, ) From 7550ee06f1e0cf77d35757223e277d21812ed4dc Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 15 Jul 2022 22:28:05 -0700 Subject: [PATCH 44/55] wip --- scripts/inference/bloom-accelerate-inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index dd2a07928..cbb8e3d18 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -3,6 +3,7 @@ import os import gc import torch +import math from transformers import AutoTokenizer, AutoModelForCausalLM def get_args(): From 5153c402e89d246d739f70233c164ae6ae748734 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sun, 17 Jul 2022 20:32:03 -0700 Subject: [PATCH 45/55] stats --- scripts/inference/README.md | 29 +++++++++++++++++++ .../inference/bloom-accelerate-inference.py | 6 ++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index 004112b87..d62b87350 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -1,6 +1,35 @@ # Inference scripts for BLOOM +## BLOOM Inference solutions +Here are some stats on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: + +All benchmarks are doing greedy generation of 100 token outputs: +``` +Generate args {'min_length': 100, 'max_length': 100, 'do_sample': False} +``` +The inputs are just a few tokens. 
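The latency numbers below are milliseconds per generated token, counted across every sequence in the batch, so dividing them into 1000 gives the aggregate tokens/sec for the whole run. For example, taking the ds-inference bs=128 entry from the first table (a quick back-of-the-envelope conversion, not an extra measurement):

```python
# convert a "msecs per token" entry from the tables below into aggregate tokens/sec
msec_per_token = 0.66                  # ds-inference at batch size 128
tokens_per_sec = 1000 / msec_per_token
print(f"~{tokens_per_sec:.0f} tokens/sec")   # roughly 1515 tokens/sec
```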
+ +Throughput in msecs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :---- | :---- | :---- | :---- | :---- | :--- | +| accelerate | 230.38 | 31.78 | 17.84 | 10.89 | oom | omm | +| ds-inference | 40.57 | 5.23 | | | 2.77 | 0.66 | +| ds-zero | 283 | 34.88 | oom | oom | oom | oom | + + +Start to ready to generate in secs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :--- | :--- | :--- | :--- | :--- | :--- | +| accelerate | 121 | 120 | 113 | 118 | | | +| ds-inference | 662 | 673 | | | 685 | 654 | +| ds-zero | 462 | 463 | | | | | +| | | | | | | | + + +DS-Inference load time (start to ready to generate) will become much faster soon. Once we stop relying on ds-zero to instantiate the model on gpu. The plan is to pre-shard the weights TP-wise for 8x and 16x gpus and load them directly on each gpu. Will probably be under 1min. ## Deepspeed-Inference diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index cbb8e3d18..7893f2fbb 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -29,9 +29,9 @@ def get_max_memory_per_gpu_dict(dtype): bytes = torch.finfo(dtype).bits / 8 param_memory_total_in_bytes = params * bytes - # add 10% since weights sizes aren't the same and some gpu may need more memory - param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.1) - print(f"Estimating {param_memory_per_gpu_in_bytes/2**30}GB per gpu for weights") + # add 5% since weight sizes aren't the same and some GPU may need more memory + param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05) + print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") # check the real available memory # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) From cb50ea5961748533c612eb1398e956719ba3759e Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 18 Jul 2022 18:22:35 -0700 Subject: [PATCH 46/55] add OnDevice and remove zero-inference (#316) --- scripts/inference/bloom-ds-inference.py | 63 ++----------------------- 1 file changed, 4 insertions(+), 59 deletions(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index c553750db..034e62450 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -41,12 +41,12 @@ parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") parser.add_argument("--batch_size", default=1, type=int, help="batch size") parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") -parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") args = parser.parse_args() local_rank = int(os.getenv('LOCAL_RANK', '0')) world_size = int(os.getenv('WORLD_SIZE', '1')) +deepspeed.init_distributed('nccl') ### Model loading and instantiating on GPU (via ZeRO) @@ -132,44 +132,14 @@ def get_checkpoint_files(pretrained_model_name_or_path): else: dtype = torch.bfloat16 -#dtype = config.dtype -#print(dtype) - -model_hidden_size = config.hidden_size -train_batch_size = 1 * world_size - -ds_config = { - "fp16": { - "enabled": dtype == torch.float16, - }, - "bf16": { - "enabled": dtype == torch.bfloat16, - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": model_hidden_size * model_hidden_size, 
- "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 0 - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False -} - -if args.cpu_offload: - ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) - -dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus - if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) -model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) +# Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load +with deepspeed.OnDevice(dtype=dtype, device='meta'): + model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) if args.benchmark: deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) @@ -178,36 +148,11 @@ def get_checkpoint_files(pretrained_model_name_or_path): rank = dist.get_rank() -if rank == 0: - print(ds_config) - -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -ds_engine.module.eval() -model = ds_engine.module - -### Deepspeed-ZeRO Unloading - -# a must to remove ZeRO-installed hooks! -ds_engine.destroy() - -# free GPU storage used by ZeRO -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus -def ds_clear_params(ds_engine): - for p in ds_engine.parameters(): - if hasattr(p, "ds_tensor"): - p.ds_tensor = torch.empty(0, dtype=p.dtype, device=p.device) - p.ds_status = ZeroParamStatus.NOT_AVAILABLE - -ds_clear_params(ds_engine) -del ds_engine - if args.benchmark: torch.cuda.empty_cache() gc.collect() deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) - - ### Deepspeed-Inference Loading checkpoints_json = "checkpoints.json" From a53fcaa5a3874fe40b313599ce273df63561dfb4 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Jul 2022 19:13:03 +0200 Subject: [PATCH 47/55] wip --- scripts/inference/bloom-accelerate-inference.py | 14 ++++++++++++-- scripts/inference/bloom-ds-inference.py | 11 ++++++----- scripts/inference/bloom-ds-zero-inference.py | 4 ++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index 7893f2fbb..c8d969b31 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -19,6 +19,17 @@ def get_args(): return parser.parse_args() def get_max_memory_per_gpu_dict(dtype): + + # works with bs=40 9.48msec + # return {0: '12GIB', 1: '46GIB', 2: '46GIB', 3: '46GIB', 4: '46GIB', 5: '46GIB', 6: '46GIB', 7: '46GIB'} + + # works with bs=40 9.47 + #return {0: '0GIB', 1: '60GIB', 2: '60GIB', 3: '60GIB', 4: '60GIB', 5: '60GIB', 6: '60GIB', 7: '60GIB'} + + #return {0: '0GIB', 1: '52GIB', 2: '52GIB', 3: '52GIB', 4: '52GIB', 5: '52GIB', 6: '52GIB', 7: '52GIB'} + return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + #return {0: '0GIB', 1: '49GIB', 2: '49GIB', 3: '49GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + # figure out the memory map - the minimum per gpu required to load the model n_gpus = torch.cuda.device_count() @@ -99,8 +110,7 @@ def get_max_memory_per_gpu_dict(dtype): # dynamically extend to support larger bs by repetition input_sentences *= 
math.ceil(args.batch_size / len(input_sentences)) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) #top_k=None if greedy else top_k, #top_p=None if greedy else top_p diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 034e62450..968440ebc 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -47,6 +47,8 @@ world_size = int(os.getenv('WORLD_SIZE', '1')) deepspeed.init_distributed('nccl') +rank = dist.get_rank() + ### Model loading and instantiating on GPU (via ZeRO) @@ -110,7 +112,7 @@ def get_checkpoint_files(pretrained_model_name_or_path): #print(get_checkpoint_files(model_name)) -if local_rank == 0: +if rank == 0: print(f"*** Loading the model {model_name}") tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -146,7 +148,6 @@ def get_checkpoint_files(pretrained_model_name_or_path): model = model.eval() -rank = dist.get_rank() if args.benchmark: torch.cuda.empty_cache() @@ -164,7 +165,7 @@ def write_checkponts_json(): #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") checkpoint_files = get_checkpoint_files(model_name) - print("Checkpoint files:", checkpoint_files) + #print("Checkpoint files:", checkpoint_files) data = { "type": "BLOOM-176B", @@ -227,8 +228,8 @@ def write_checkponts_json(): # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) + if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 5a776d1b9..5fd192555 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -135,8 +135,8 @@ # dynamically extend to support larger bs by repetition input_sentences *= math.ceil(args.batch_size / len(input_sentences)) -generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=True) +generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) + if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] From 7252879759e40ec14e5ea236924fccac70e14fa9 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Jul 2022 11:37:23 -0700 Subject: [PATCH 48/55] rework generate + benchmark --- .../inference/bloom-accelerate-inference.py | 38 +++++++++---------- scripts/inference/bloom-ds-inference.py | 33 ++++++++-------- scripts/inference/bloom-ds-zero-inference.py | 33 +++++++++------- 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index c8d969b31..918c8599c 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -112,37 +112,36 @@ def get_max_memory_per_gpu_dict(dtype): generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) -#top_k=None if greedy else 
top_k, -#top_p=None if greedy else top_p - if rank == 0: print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] def generate(): - """ returns a list of pairs of inputs and outputs """ + """ returns a list of zipped inputs, outputs and number of new tokens """ - tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) - for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to("cuda:0") + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("cuda:0") - greedy_output = model.generate(**tokens, **generate_kwargs) + outputs = model.generate(**input_tokens, **generate_kwargs) - outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] - return zip(inputs, outputs) + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) -# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size + return zip(inputs, outputs, total_new_tokens) # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs _ = generate() t_generate_start = time.time() -pairs = generate() +generated = generate() t_generate_span = time.time() - t_generate_start if rank == 0: - for i,o in pairs: + for i,o,_ in generated: print(f"{'-'*60}\nin={i}\nout={o}\n") @@ -152,7 +151,6 @@ def generate(): ### Benchmark -# benchmark it! 
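# Note: total_new_tokens comes from the output shapes in generate() above rather
# than an assumed num_tokens * batch_size. With max_new_tokens and greedy decoding
# a sequence can stop early at its EOS token, and prompt tokens must not be counted,
# so the throughput numbers below reflect the tokens that were actually produced.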
if args.benchmark: if rank == 0: print(f"*** Running benchmark") @@ -165,17 +163,17 @@ def generate(): # benchmark t0 = time.time() cycles = 5 + total_new_tokens_generated = 0 for i in range(cycles): - _ = generate() + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) torch.cuda.synchronize() if rank == 0: - # note that dividing by world_size as well as we can have world_size streams - tokens_in_cycle = num_tokens * args.batch_size - througput = (time.time() - t0)/(cycles * tokens_in_cycle) + througput = (time.time() - t0)/(cycles * total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 968440ebc..d11c0ddbb 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -234,19 +234,22 @@ def write_checkponts_json(): print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] def generate(): - """ returns a list of pairs of inputs and outputs """ + """ returns a list of zipped inputs, outputs and number of new tokens """ - tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) - for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to(torch.cuda.current_device()) + outputs = model.generate(**input_tokens, **generate_kwargs) - greedy_output = model.generate(**tokens, **generate_kwargs) + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] - outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - return zip(inputs, outputs) + return zip(inputs, outputs, total_new_tokens) # warmup is a must if measuring speed as it's when all the optimizations are performed @@ -254,13 +257,12 @@ def generate(): _ = generate() t_generate_start = time.time() -pairs = generate() +generated = generate() t_generate_span = time.time() - t_generate_start if rank == 0: - for i,o in pairs: + for i,o,_ in generated: print(f"{'-'*60}\nin={i}\nout={o}\n") - if args.benchmark: torch.cuda.empty_cache() gc.collect() @@ -281,16 +283,17 @@ def generate(): # benchmark t0 = time.time() cycles = 5 + total_new_tokens_generated = 0 for i in range(cycles): - _ = generate() + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) torch.cuda.synchronize() if rank == 0: - tokens_in_cycle = num_tokens * args.batch_size - througput = (time.time() - t0)/(cycles * tokens_in_cycle) + througput = (time.time() - t0)/(cycles * total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to 
generate: {t_ready - t_start:.3f} secs -Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 5fd192555..f64f627f1 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -141,19 +141,22 @@ print(f"Generate args {generate_kwargs}") inputs = input_sentences[:args.batch_size] def generate(): - """ returns a list of pairs of inputs and outputs """ + """ returns a list of zipped inputs, outputs and number of new tokens """ - tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("cuda:0") - for t in tokens: - if torch.is_tensor(tokens[t]): - tokens[t] = tokens[t].to(torch.cuda.current_device()) + outputs = model.generate(**input_tokens, **generate_kwargs) - greedy_output = model.generate(**tokens, **generate_kwargs) + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] - outputs = tokenizer.batch_decode(greedy_output, skip_special_tokens=True) + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - return zip(inputs, outputs) + return zip(inputs, outputs, total_new_tokens) # XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size @@ -165,7 +168,7 @@ def generate(): pairs = generate() t_generate_span = time.time() - t_generate_start if rank == 0: - for i,o in pairs: + for i,o,_ in pairs: print(f"{'-'*60}\nin={i}\nout={o}\n") @@ -176,7 +179,6 @@ def generate(): ### Benchmark -# benchmark it! 
if args.benchmark: if rank == 0: print(f"*** Running benchmark") @@ -192,14 +194,17 @@ def generate(): for i in range(cycles): _ = generate() torch.cuda.synchronize() + total_new_tokens_generated = 0 if rank == 0: - # note that dividing by world_size as well as we can have world_size streams - tokens_in_cycle = num_tokens * args.batch_size * world_size - througput = (time.time() - t0)/(cycles * tokens_in_cycle) + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) + total_new_tokens_generated *= world_size + througput = (time.time() - t0)/(cycles * total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {tokens_in_cycle} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) From 2aa419d4cba23fa6f463ebf061ed9f40172fd38f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Jul 2022 12:19:13 -0700 Subject: [PATCH 49/55] figure out the memory map dynamically --- .../inference/bloom-accelerate-inference.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index 918c8599c..edb78eaa1 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -4,7 +4,7 @@ import gc import torch import math -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM def get_args(): parser = argparse.ArgumentParser() @@ -18,28 +18,33 @@ def get_args(): return parser.parse_args() -def get_max_memory_per_gpu_dict(dtype): - - # works with bs=40 9.48msec - # return {0: '12GIB', 1: '46GIB', 2: '46GIB', 3: '46GIB', 4: '46GIB', 5: '46GIB', 6: '46GIB', 7: '46GIB'} - - # works with bs=40 9.47 - #return {0: '0GIB', 1: '60GIB', 2: '60GIB', 3: '60GIB', 4: '60GIB', 5: '60GIB', 6: '60GIB', 7: '60GIB'} - - #return {0: '0GIB', 1: '52GIB', 2: '52GIB', 3: '52GIB', 4: '52GIB', 5: '52GIB', 6: '52GIB', 7: '52GIB'} - return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} - #return {0: '0GIB', 1: '49GIB', 2: '49GIB', 3: '49GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} +def get_max_memory_per_gpu_dict(dtype, model_name): + """ try to generate the memory map based on what we know about the model and the available hardware """ # figure out the memory map - the minimum per gpu required to load the model n_gpus = torch.cuda.device_count() - # hardcode for now for bloom - params = 179 * 2**30 - # XXX: how to figure out model size w/o having a model object yet? 
- #params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: + # hand crafted optimized memory map for 8x80 setup over BLOOM + # this works with bs=48 + return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + + try: + # model_params calculation, as we don't have a model yet to do: + #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + + config = AutoConfig.from_pretrained(model_name) + h = config.n_embed + l = config.n_layer + v = config.vocab_size + # from https://github.com/bigscience-workshop/bigscience/tree/a3e451498ee8189d2a9dd47be19aa89b0e16cd89/math#model-sizing + model_params = l*(12*h**2 + 13*h) + v*h + 4*h + except: + print(f"The model {model_name} has a broken config file. Please notify the owner") + raise bytes = torch.finfo(dtype).bits / 8 - param_memory_total_in_bytes = params * bytes + param_memory_total_in_bytes = model_params * bytes # add 5% since weight sizes aren't the same and some GPU may need more memory param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05) print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") @@ -80,7 +85,7 @@ def get_max_memory_per_gpu_dict(dtype): model = AutoModelForCausalLM.from_pretrained( model_name, device_map="auto", - max_memory=get_max_memory_per_gpu_dict(dtype), + max_memory=get_max_memory_per_gpu_dict(dtype, model_name), torch_dtype=dtype, ) From 4bd8ca5b4e5682313d96a8da2c58b854bb4cc9b5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Jul 2022 15:22:40 -0700 Subject: [PATCH 50/55] bug fix --- scripts/inference/bloom-accelerate-inference.py | 2 +- scripts/inference/bloom-ds-inference.py | 2 +- scripts/inference/bloom-ds-zero-inference.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index edb78eaa1..ee89ca6f5 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -174,7 +174,7 @@ def generate(): total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) torch.cuda.synchronize() if rank == 0: - througput = (time.time() - t0)/(cycles * total_new_tokens_generated) + througput = (time.time() - t0)/(total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index d11c0ddbb..3be81a27c 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -289,7 +289,7 @@ def generate(): total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) torch.cuda.synchronize() if rank == 0: - througput = (time.time() - t0)/(cycles * total_new_tokens_generated) + througput = (time.time() - t0)/(total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index f64f627f1..ebf5e8f7c 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -200,7 +200,7 @@ def generate(): total_new_tokens_generated += sum(new_tokens for 
_,_,new_tokens in generated) # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) total_new_tokens_generated *= world_size - througput = (time.time() - t0)/(cycles * total_new_tokens_generated) + througput = (time.time() - t0)/(total_new_tokens_generated) print(f""" *** Performance stats: Throughput per token including tokenize: {througput*1000:.2f} msecs From b76e516611f5a694c44b86a3e4306cdce09c767a Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Jul 2022 16:19:20 -0700 Subject: [PATCH 51/55] fix ds-zero-inference wrt device --- scripts/inference/bloom-ds-zero-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index ebf5e8f7c..36859d611 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -146,7 +146,7 @@ def generate(): input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("cuda:0") + input_tokens[t] = input_tokens[t].to(device=rank) outputs = model.generate(**input_tokens, **generate_kwargs) From ecfd577161d41fdd087fbb1d415edf18883e34c8 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 20 Jul 2022 21:07:30 +0200 Subject: [PATCH 52/55] bug fix --- scripts/inference/bloom-ds-zero-inference.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py index 36859d611..043b4967f 100644 --- a/scripts/inference/bloom-ds-zero-inference.py +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -146,7 +146,7 @@ def generate(): input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(device=rank) + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) outputs = model.generate(**input_tokens, **generate_kwargs) @@ -191,13 +191,13 @@ def generate(): # benchmark t0 = time.time() cycles = 5 - for i in range(cycles): - _ = generate() - torch.cuda.synchronize() total_new_tokens_generated = 0 - if rank == 0: + for i in range(cycles): generated = generate() total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + + torch.cuda.synchronize() + if rank == 0: # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) total_new_tokens_generated *= world_size througput = (time.time() - t0)/(total_new_tokens_generated) @@ -208,3 +208,4 @@ def generate(): Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs Start to finish: {t_ready - t_start + t_generate_span:.3f} secs """) + From fd26b9c4650c74e4159c7aed60a282176f87ac7f Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 20 Jul 2022 21:07:41 +0200 Subject: [PATCH 53/55] update --- scripts/inference/bloom-accelerate-inference.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index ee89ca6f5..415b2f765 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -26,7 +26,7 @@ def get_max_memory_per_gpu_dict(dtype, model_name): if model_name == 
"bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: # hand crafted optimized memory map for 8x80 setup over BLOOM - # this works with bs=48 + # this works with bs=40 return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} try: @@ -37,7 +37,7 @@ def get_max_memory_per_gpu_dict(dtype, model_name): h = config.n_embed l = config.n_layer v = config.vocab_size - # from https://github.com/bigscience-workshop/bigscience/tree/a3e451498ee8189d2a9dd47be19aa89b0e16cd89/math#model-sizing + # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing model_params = l*(12*h**2 + 13*h) + v*h + 4*h except: print(f"The model {model_name} has a broken config file. Please notify the owner") @@ -116,6 +116,8 @@ def get_max_memory_per_gpu_dict(dtype, model_name): input_sentences *= math.ceil(args.batch_size / len(input_sentences)) generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) +#generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) if rank == 0: print(f"Generate args {generate_kwargs}") From e2bfe9169de81f63d7013987b6e1888af561decb Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 22 Jul 2022 13:04:07 -0700 Subject: [PATCH 54/55] update --- scripts/inference/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/inference/README.md b/scripts/inference/README.md index d62b87350..44e98f9fb 100644 --- a/scripts/inference/README.md +++ b/scripts/inference/README.md @@ -42,7 +42,6 @@ https://www.deepspeed.ai/tutorials/inference-tutorial/ ``` git clone https://github.com/microsoft/DeepSpeed cd DeepSpeed -git checkout ds-inference/bloom-support pip install . ``` From b9a67ea59397c82eb8ee3ccb6ffe2fa51ea39f2b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 9 Aug 2022 15:37:14 -0700 Subject: [PATCH 55/55] fix --- scripts/inference/bloom-ds-inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py index 3be81a27c..c21dfeb96 100644 --- a/scripts/inference/bloom-ds-inference.py +++ b/scripts/inference/bloom-ds-inference.py @@ -50,7 +50,7 @@ rank = dist.get_rank() -### Model loading and instantiating on GPU (via ZeRO) +### Model loading and instantiating on GPUs def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or