From 47f3fc0c6a42ddac2af89cb83dc99cb5cec1e215 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Fri, 12 Aug 2022 15:48:38 +0530 Subject: [PATCH 01/32] first step towards making libs --- .../inference/bloom-accelerate-inference.py | 261 ++++++++---------- scripts/inference/utils.py | 187 +++++++++++++ 2 files changed, 309 insertions(+), 139 deletions(-) create mode 100644 scripts/inference/utils.py diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py index 415b2f765..19c9237d9 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom-accelerate-inference.py @@ -1,22 +1,32 @@ -import argparse -import time -import os import gc + import torch -import math -from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer + +import utils +from utils import ( + Execute, + benchmark_generation, + generate, + get_argument_parser, + get_benchmark_results, + get_dummy_batch, + print_rank_n, + run_and_log_time, +) + def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") - parser.add_argument("--name", type=str, help="Name path", required=True) - parser.add_argument("--batch_size", default=1, type=int, help="batch size") - parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") - parser.add_argument("--greedy", action="store_true") - parser.add_argument("--top-k", type=int, default=0) - parser.add_argument("--top-p", type=float, default=0.) + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument("--benchmark_cycles", type=int, + default=0, help="additionally run benchmark") + + args = utils.get_args(parser) + + return args - return parser.parse_args() def get_max_memory_per_gpu_dict(dtype, model_name): """ try to generate the memory map based on what we know about the model and the available hardware """ @@ -58,129 +68,102 @@ def get_max_memory_per_gpu_dict(dtype, model_name): return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} -t_start = time.time() - -num_tokens = 100 - -args = get_args() - -local_rank = int(os.getenv('LOCAL_RANK', '0')) -world_size = int(os.getenv('WORLD_SIZE', '1')) - -rank = local_rank - -model_name = args.name -if rank == 0: - print(f"Loading model {model_name}") - - -tokenizer = AutoTokenizer.from_pretrained(model_name) - -# XXX: can't automatically derive dtype via config's `from_pretrained` -dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 - -#print(get_max_memory_per_gpu_dict()) - - -model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map="auto", - max_memory=get_max_memory_per_gpu_dict(dtype, model_name), - torch_dtype=dtype, -) - -if args.benchmark: - t_ready = time.time() - - - -### Generate - -if rank == 0: - print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") - -input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way" -] - -if args.batch_size > len(input_sentences): - # dynamically extend to support larger bs by repetition - input_sentences *= 
math.ceil(args.batch_size / len(input_sentences)) - -generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) -#generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) -#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) - -if rank == 0: - print(f"Generate args {generate_kwargs}") -inputs = input_sentences[:args.batch_size] -def generate(): - """ returns a list of zipped inputs, outputs and number of new tokens """ - - input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to("cuda:0") - - outputs = model.generate(**input_tokens, **generate_kwargs) - - input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_tokens_lengths = [x.shape[0] for x in outputs] - - total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - return zip(inputs, outputs, total_new_tokens) - -# warmup is a must if measuring speed as it's when all the optimizations are performed -# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs -_ = generate() - -t_generate_start = time.time() -generated = generate() -t_generate_span = time.time() - t_generate_start -if rank == 0: - for i,o,_ in generated: - print(f"{'-'*60}\nin={i}\nout={o}\n") - - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - -### Benchmark - -if args.benchmark: - if rank == 0: - print(f"*** Running benchmark") - - # warm up - for i in range(1): - _ = generate() - torch.cuda.synchronize() - - # benchmark - t0 = time.time() - cycles = 5 - total_new_tokens_generated = 0 - for i in range(cycles): - generated = generate() - total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) - torch.cuda.synchronize() - if rank == 0: - througput = (time.time() - t0)/(total_new_tokens_generated) - print(f""" -*** Performance stats: -Throughput per token including tokenize: {througput*1000:.2f} msecs -Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs -Start to finish: {t_ready - t_start + t_generate_span:.3f} secs -""") +def main(): + args = get_args() + print_rank_n(f"Loading model {args.model_name}") + + (tokenizer, model), initialization_time = run_and_log_time( + [ + Execute( + AutoTokenizer.from_pretrained, + { + "pretrained_model_name_or_path": args.model_name, + } + ), + Execute( + AutoModelForCausalLM.from_pretrained, + { + "pretrained_model_name_or_path": args.model_name, + "device_map": "auto", + "max_memory": get_max_memory_per_gpu_dict(args.dtype, args.model_name), + "torch_dtype": args.dtype + } + ) + ] + ) + + print_rank_n( + f"*** Starting to generate {args.max_new_tokens} tokens with bs={args.batch_size}") + + input_sentences = get_dummy_batch(args.batch_size) + generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) + + print_rank_n(f"Generate args {generate_kwargs}") + + # warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + _ = generate( + input_sentences, + model, + tokenizer, + generate_kwargs, + "cuda:0" + ) + + generated, generation_time = run_and_log_time( + Execute( + generate, + { + "inputs": input_sentences, + "model": model, + "tokenizer": tokenizer, + "generate_kwargs": generate_kwargs, + "input_device": "cuda:0" + } + ) + ) + for i, (o, _) in zip(input_sentences, generated): + print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") + + if (args.benchmark_cycles > 0): + print_rank_n(f"*** Running benchmark") + + torch.cuda.empty_cache() + gc.collect() + + # warm up + _ = generate( + input_sentences, + model, + tokenizer, + generate_kwargs, + "cuda:0" + ) + torch.cuda.synchronize() + + # benchmark + total_new_tokens_generated, benchmark_time = run_and_log_time( + Execute( + benchmark_generation, + { + "input_sentences": input_sentences, + "model": model, + "tokenizer": tokenizer, + "generate_kwargs": generate_kwargs, + "input_device": "cuda:0", + } + ) + ) + print_rank_n( + get_benchmark_results( + benchmark_time, + initialization_time, + generation_time, + total_new_tokens_generated, + args.batch_size + ) + ) + +if (__name__ == "__main__"): + main() diff --git a/scripts/inference/utils.py b/scripts/inference/utils.py new file mode 100644 index 000000000..7ea213c62 --- /dev/null +++ b/scripts/inference/utils.py @@ -0,0 +1,187 @@ +import argparse +import copy +import math +import time +from typing import Any, List, Tuple, Union + +import torch +import torch.distributed as dist +from transformers import AutoModelForCausalLM, AutoTokenizer + + +dummy_input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + + +class MaxTokensError(Exception): + def __init__(self, max_new_tokens: int, allowed_max_new_tokens: int) -> None: + super().__init__("max_new_tokens = {} > {} is not supported.".format( + max_new_tokens, allowed_max_new_tokens)) + + +class Execute: + def __init__(self, func: callable, kwargs: dict) -> None: + self.func = func + self.kwargs = kwargs + + def __call__(self) -> Any: + return self.func(**self.kwargs) + + +def get_argument_parser(): + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="model") + group.add_argument("--model_name", type=str, + required=True, help="model to use") + group.add_argument("--dtype", type=str, required=True, + choices=["bf16", "fp16"], help="dtype for model") + group.add_argument("--batch_size", default=1, type=int, help="batch size") + + group = parser.add_argument_group(title="default values") + group.add_argument("--greedy", action="store_true") + group.add_argument("--top_k", type=int, default=0, help="default top_k") + group.add_argument("--top_p", type=float, default=0, help="default top_p") + group.add_argument("--temperature", type=float, + default=1, help="default temperature") + group.add_argument("--min_length", type=int, default=1, help="min length") + group.add_argument("--max_new_tokens", type=int, + default=100, help="max new tokens") + + return parser + + +def get_args(parser: argparse.ArgumentParser): + args = parser.parse_args() + args.dtype = get_torch_dtype(args.dtype) + return args + + +def run_rank_n(func: callable, + kwargs: dict, + barrier: bool = False, + rank: int = 0) -> Any: + if (dist.is_initialized()): + if 
(dist.get_rank() == rank): + output = func(**kwargs) + if (barrier): + dist.barrier() + return output + else: + if (barrier): + dist.barrier() + else: + return func(**kwargs) + + +def print_rank_n(*values, rank: int = 0) -> None: + if (dist.is_initialized()): + if (dist.get_rank() == rank): + print(*values) + else: + print(*values) + + +def get_torch_dtype(dtype_str: str) -> torch.dtype: + if (dtype_str == "bf16"): + return torch.bfloat16 + elif (dtype_str == "fp16"): + return torch.float16 + + +def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: + if (input_sentences == None): + input_sentences = copy.deepcopy(dummy_input_sentences) + + if (batch_size > len(input_sentences)): + input_sentences *= math.ceil(batch_size / len(input_sentences)) + input_sentences = input_sentences[:batch_size] + + return input_sentences + + +def generate(inputs: List[str], + model: AutoModelForCausalLM, + tokenizer: AutoTokenizer, + generate_kwargs: dict, + input_device) -> Tuple[List[str], List[int]]: + """ returns a list of zipped outputs and number of new tokens """ + + input_tokens = tokenizer( + inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(input_device) + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i, + o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(outputs, total_new_tokens) + + +def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: + """ + runs a list of Execute objects and returns a list of outputs and the time taken + """ + start_time = time.time() + + if (type(execs) == list): + results = [] + for e in execs: + results.append(e()) + else: + results = execs() + + time_elapsed = time.time() - start_time + return results, time_elapsed + + +def benchmark_generation(input_sentences, + model, + tokenizer, + generate_kwargs, + input_device, + cycles: int = 5): + total_new_tokens_generated = 0 + for _ in range(cycles): + generated = generate( + input_sentences, + model, + tokenizer, + generate_kwargs, + input_device + ) + total_new_tokens_generated += sum(new_tokens for _, + new_tokens in generated) + return total_new_tokens_generated + + +def get_benchmark_results(benchmark_time: float, + initialization_time: float, + generation_time: float, + total_new_tokens_generated: int, + batch_size: int) -> str: + throughput = total_new_tokens_generated / benchmark_time + return f""" +*** Performance stats: +Throughput (including tokenization) = {throughput:.2f} tokens/sec +Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token +Model loading time = {initialization_time:.2f} secs +Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} +Generation time per batch = {generation_time:.2f} secs +Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs +""" From 435af43ff7d1e87ecd10fb56e2b768af67c568fc Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Fri, 12 Aug 2022 22:12:12 +0530 Subject: [PATCH 02/32] HF accelerate model --- ...rence.py => bloom_accelerate_inference.py} | 138 +++++++++++------- scripts/inference/utils.py | 50 +------ 2 files changed, 95 insertions(+), 93 deletions(-) rename 
scripts/inference/{bloom-accelerate-inference.py => bloom_accelerate_inference.py} (52%) diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom_accelerate_inference.py similarity index 52% rename from scripts/inference/bloom-accelerate-inference.py rename to scripts/inference/bloom_accelerate_inference.py index 19c9237d9..6b58c01d5 100644 --- a/scripts/inference/bloom-accelerate-inference.py +++ b/scripts/inference/bloom_accelerate_inference.py @@ -1,4 +1,5 @@ import gc +from typing import List, Union import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -7,7 +8,6 @@ from utils import ( Execute, benchmark_generation, - generate, get_argument_parser, get_benchmark_results, get_dummy_batch, @@ -16,6 +16,59 @@ ) +class HFAccelerateModel: + def __init__(self, model_name: str, dtype: torch.dtype) -> None: + print_rank_n("Loading model...") + + self.tokenizer = AutoTokenizer.from_pretrained(model_name) + + self.model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="auto", + max_memory=get_max_memory_per_gpu_dict( + dtype, model_name), + torch_dtype=dtype + ) + + self.model.eval() + self.input_device = "cuda:0" + + print_rank_n("Model loaded") + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[str, List[str]]: + if (type(text) == str): + text = [text] + + input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) + + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(self.input_device) + + with torch.no_grad(): + output_tokens = self.model.generate( + **input_tokens, + **generate_kwargs + ) + + input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_token_lengths = [x.shape[0] for x in output_tokens] + generated_tokens = [ + o - i for i, o in zip(input_token_lengths, output_token_lengths)] + + if (remove_input_from_output): + output_tokens = [x[-i:] + for x, i in zip(output_tokens, generated_tokens)] + + output_text = self.tokenizer.batch_decode( + output_tokens, skip_special_tokens=True) + + return output_text, generated_tokens + + def get_args(): parser = get_argument_parser() @@ -50,80 +103,74 @@ def get_max_memory_per_gpu_dict(dtype, model_name): # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing model_params = l*(12*h**2 + 13*h) + v*h + 4*h except: - print(f"The model {model_name} has a broken config file. Please notify the owner") + print_rank_n( + f"The model {model_name} has a broken config file. 
Please notify the owner") raise bytes = torch.finfo(dtype).bits / 8 param_memory_total_in_bytes = model_params * bytes # add 5% since weight sizes aren't the same and some GPU may need more memory - param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05) - print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") + param_memory_per_gpu_in_bytes = int( + param_memory_total_in_bytes / n_gpus * 1.05) + print_rank_n( + f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") # check the real available memory # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) torch.ones(1).cuda() max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: - raise ValueError(f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") + raise ValueError( + f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} def main(): args = get_args() - print_rank_n(f"Loading model {args.model_name}") - (tokenizer, model), initialization_time = run_and_log_time( - [ - Execute( - AutoTokenizer.from_pretrained, - { - "pretrained_model_name_or_path": args.model_name, - } - ), - Execute( - AutoModelForCausalLM.from_pretrained, - { - "pretrained_model_name_or_path": args.model_name, - "device_map": "auto", - "max_memory": get_max_memory_per_gpu_dict(args.dtype, args.model_name), - "torch_dtype": args.dtype - } - ) - ] + model, initialization_time = run_and_log_time( + Execute( + HFAccelerateModel, + { + "model_name": args.model_name, + "dtype": args.dtype, + } + ) ) + if (args.generate_kwargs): + generate_kwargs = args.generate_kwargs + else: + generate_kwargs = { + "max_new_tokens": 100, + "do_sample": False + } + print_rank_n( - f"*** Starting to generate {args.max_new_tokens} tokens with bs={args.batch_size}") + f"*** Starting to generate {generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") input_sentences = get_dummy_batch(args.batch_size) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) print_rank_n(f"Generate args {generate_kwargs}") # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - _ = generate( + model.generate( input_sentences, - model, - tokenizer, - generate_kwargs, - "cuda:0" + generate_kwargs ) - generated, generation_time = run_and_log_time( + (output_text, num_generated_tokens), generation_time = run_and_log_time( Execute( - generate, + model.generate, { - "inputs": input_sentences, - "model": model, - "tokenizer": tokenizer, - "generate_kwargs": generate_kwargs, - "input_device": "cuda:0" + "text": input_sentences, + "generate_kwargs": generate_kwargs } ) ) - for i, (o, _) in zip(input_sentences, generated): + for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") if (args.benchmark_cycles > 0): @@ -133,13 +180,7 @@ def main(): gc.collect() # warm up - _ = generate( - input_sentences, - model, - tokenizer, - generate_kwargs, - "cuda:0" - ) + model.generate(input_sentences, generate_kwargs) torch.cuda.synchronize() # benchmark @@ -149,9 +190,7 @@ def main(): { "input_sentences": input_sentences, "model": model, - "tokenizer": tokenizer, - "generate_kwargs": generate_kwargs, - "input_device": "cuda:0", + "generate_kwargs": generate_kwargs } ) ) @@ -165,5 +204,6 @@ def main(): ) ) + if (__name__ == "__main__"): main() diff --git a/scripts/inference/utils.py b/scripts/inference/utils.py index 7ea213c62..f6b7e0345 100644 --- a/scripts/inference/utils.py +++ b/scripts/inference/utils.py @@ -45,16 +45,8 @@ def get_argument_parser(): group.add_argument("--dtype", type=str, required=True, choices=["bf16", "fp16"], help="dtype for model") group.add_argument("--batch_size", default=1, type=int, help="batch size") - - group = parser.add_argument_group(title="default values") - group.add_argument("--greedy", action="store_true") - group.add_argument("--top_k", type=int, default=0, help="default top_k") - group.add_argument("--top_p", type=float, default=0, help="default top_p") - group.add_argument("--temperature", type=float, - default=1, help="default temperature") - group.add_argument("--min_length", type=int, default=1, help="min length") - group.add_argument("--max_new_tokens", type=int, - default=100, help="max new tokens") + group.add_argument("--generate_kwargs", type=dict, default={}, + help="generate parameters. 
look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters") return parser @@ -108,31 +100,6 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[ return input_sentences -def generate(inputs: List[str], - model: AutoModelForCausalLM, - tokenizer: AutoTokenizer, - generate_kwargs: dict, - input_device) -> Tuple[List[str], List[int]]: - """ returns a list of zipped outputs and number of new tokens """ - - input_tokens = tokenizer( - inputs, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(input_device) - - outputs = model.generate(**input_tokens, **generate_kwargs) - - input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_tokens_lengths = [x.shape[0] for x in outputs] - - total_new_tokens = [o-i for i, - o in zip(input_tokens_lengths, output_tokens_lengths)] - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - return zip(outputs, total_new_tokens) - - def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: """ runs a list of Execute objects and returns a list of outputs and the time taken @@ -152,21 +119,16 @@ def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], f def benchmark_generation(input_sentences, model, - tokenizer, generate_kwargs, - input_device, cycles: int = 5): total_new_tokens_generated = 0 for _ in range(cycles): - generated = generate( + _, num_generated_tokens = model.generate( input_sentences, - model, - tokenizer, - generate_kwargs, - input_device + generate_kwargs ) - total_new_tokens_generated += sum(new_tokens for _, - new_tokens in generated) + total_new_tokens_generated += sum( + new_tokens for new_tokens in num_generated_tokens) return total_new_tokens_generated From d1676fccce7fd9da1d1bba99d03f802f94b7b273 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 13 Aug 2022 14:39:36 +0530 Subject: [PATCH 03/32] refactor accelerate --- .../inference/bloom_accelerate_inference.py | 104 ++---------------- scripts/inference/utils.py | 86 ++++++++++++++- 2 files changed, 95 insertions(+), 95 deletions(-) diff --git a/scripts/inference/bloom_accelerate_inference.py b/scripts/inference/bloom_accelerate_inference.py index 6b58c01d5..8fc5a6120 100644 --- a/scripts/inference/bloom_accelerate_inference.py +++ b/scripts/inference/bloom_accelerate_inference.py @@ -1,4 +1,4 @@ -import gc +from argparse import Namespace from typing import List, Union import torch @@ -6,28 +6,25 @@ import utils from utils import ( - Execute, - benchmark_generation, + Model, + benchmark_end_to_end, get_argument_parser, - get_benchmark_results, - get_dummy_batch, - print_rank_n, - run_and_log_time, + print_rank_n ) -class HFAccelerateModel: - def __init__(self, model_name: str, dtype: torch.dtype) -> None: +class HFAccelerateModel(Model): + def __init__(self, args: Namespace) -> None: print_rank_n("Loading model...") - self.tokenizer = AutoTokenizer.from_pretrained(model_name) + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) self.model = AutoModelForCausalLM.from_pretrained( - model_name, + args.model_name, device_map="auto", max_memory=get_max_memory_per_gpu_dict( - dtype, model_name), - torch_dtype=dtype + args.dtype, args.model_name), + torch_dtype=args.dtype ) self.model.eval() @@ -126,84 +123,5 @@ def get_max_memory_per_gpu_dict(dtype, model_name): 
return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} -def main(): - args = get_args() - - model, initialization_time = run_and_log_time( - Execute( - HFAccelerateModel, - { - "model_name": args.model_name, - "dtype": args.dtype, - } - ) - ) - - if (args.generate_kwargs): - generate_kwargs = args.generate_kwargs - else: - generate_kwargs = { - "max_new_tokens": 100, - "do_sample": False - } - - print_rank_n( - f"*** Starting to generate {generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") - - input_sentences = get_dummy_batch(args.batch_size) - - print_rank_n(f"Generate args {generate_kwargs}") - - # warmup is a must if measuring speed as it's when all the optimizations are performed - # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - model.generate( - input_sentences, - generate_kwargs - ) - - (output_text, num_generated_tokens), generation_time = run_and_log_time( - Execute( - model.generate, - { - "text": input_sentences, - "generate_kwargs": generate_kwargs - } - ) - ) - for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): - print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") - - if (args.benchmark_cycles > 0): - print_rank_n(f"*** Running benchmark") - - torch.cuda.empty_cache() - gc.collect() - - # warm up - model.generate(input_sentences, generate_kwargs) - torch.cuda.synchronize() - - # benchmark - total_new_tokens_generated, benchmark_time = run_and_log_time( - Execute( - benchmark_generation, - { - "input_sentences": input_sentences, - "model": model, - "generate_kwargs": generate_kwargs - } - ) - ) - print_rank_n( - get_benchmark_results( - benchmark_time, - initialization_time, - generation_time, - total_new_tokens_generated, - args.batch_size - ) - ) - - if (__name__ == "__main__"): - main() + benchmark_end_to_end(get_args(), HFAccelerateModel) diff --git a/scripts/inference/utils.py b/scripts/inference/utils.py index f6b7e0345..777f1a3ba 100644 --- a/scripts/inference/utils.py +++ b/scripts/inference/utils.py @@ -1,12 +1,12 @@ import argparse import copy +import gc import math import time -from typing import Any, List, Tuple, Union +from typing import Any, List, Union import torch import torch.distributed as dist -from transformers import AutoModelForCausalLM, AutoTokenizer dummy_input_sentences = [ @@ -36,6 +36,17 @@ def __call__(self) -> Any: return self.func(**self.kwargs) +class Model: + def __init__(self, args: argparse.Namespace) -> None: + raise NotImplementedError("This is a dummy class") + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[str, List[str]]: + raise NotImplementedError("This is a dummy class") + + def get_argument_parser(): parser = argparse.ArgumentParser() @@ -147,3 +158,74 @@ def get_benchmark_results(benchmark_time: float, Generation time per batch = {generation_time:.2f} secs Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs """ + + +def benchmark_end_to_end(args: argparse.Namespace, model_class: Model) -> None: + model, initialization_time = run_and_log_time( + Execute(model_class, {"args": args}) + ) + + if (args.generate_kwargs): + generate_kwargs = args.generate_kwargs + else: + generate_kwargs = { + "max_new_tokens": 100, + "do_sample": False + } + + print_rank_n( + f"*** Starting to generate {generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") + + input_sentences = 
get_dummy_batch(args.batch_size) + + print_rank_n(f"Generate args {generate_kwargs}") + + # warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + model.generate( + input_sentences, + generate_kwargs + ) + + (output_text, num_generated_tokens), generation_time = run_and_log_time( + Execute( + model.generate, + { + "text": input_sentences, + "generate_kwargs": generate_kwargs + } + ) + ) + for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): + print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") + + if (args.benchmark_cycles > 0): + print_rank_n(f"*** Running benchmark") + + torch.cuda.empty_cache() + gc.collect() + + # warm up + model.generate(input_sentences, generate_kwargs) + torch.cuda.synchronize() + + # benchmark + total_new_tokens_generated, benchmark_time = run_and_log_time( + Execute( + benchmark_generation, + { + "input_sentences": input_sentences, + "model": model, + "generate_kwargs": generate_kwargs + } + ) + ) + print_rank_n( + get_benchmark_results( + benchmark_time, + initialization_time, + generation_time, + total_new_tokens_generated, + args.batch_size + ) + ) From eef490c906f82dffda33588fe75d42e72cc86d7a Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 13 Aug 2022 18:55:47 +0530 Subject: [PATCH 04/32] refactor DS inference --- scripts/inference/bloom-ds-inference.py | 299 ------------------------ scripts/inference/bloom_ds_inference.py | 237 +++++++++++++++++++ 2 files changed, 237 insertions(+), 299 deletions(-) delete mode 100644 scripts/inference/bloom-ds-inference.py create mode 100644 scripts/inference/bloom_ds_inference.py diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py deleted file mode 100644 index c21dfeb96..000000000 --- a/scripts/inference/bloom-ds-inference.py +++ /dev/null @@ -1,299 +0,0 @@ -# usage: -# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom -# -# to run benchmarks: -# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark -# - - -# This is going to improve, but at the moment, the process is a bit cumbersome - we first use -# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, -# 2. free the allocated storage -# 3. start Deepspeed-Inference and only now load the checkpoint -# 4. run generate -# Done. 
-# - - -from argparse import ArgumentParser -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from transformers.deepspeed import HfDeepSpeedConfig -from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock -import deepspeed -import gc -import glob -import io -import json -import math -import os -import sys -import time -import torch -import torch.distributed as dist - -t_start = time.time() - -num_tokens = 100 - -parser = ArgumentParser() - -parser.add_argument("--name", required=True, type=str, help="model_name") -parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") -parser.add_argument("--batch_size", default=1, type=int, help="batch size") -parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") -args = parser.parse_args() - -local_rank = int(os.getenv('LOCAL_RANK', '0')) -world_size = int(os.getenv('WORLD_SIZE', '1')) - -deepspeed.init_distributed('nccl') -rank = dist.get_rank() - - -### Model loading and instantiating on GPUs - -def get_checkpoint_files(pretrained_model_name_or_path): - # XXX: I just hacked this one together to automatically handle the fetching of the model file or - # shards into cache and returning the cached entries - note that I removed most arguments - - from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode - from transformers.utils.hub import EntryNotFoundError - from transformers.modeling_utils import get_checkpoint_shard_files - - cache_dir = None - is_sharded = False - - # XXX: preparation for revision branches if needed - revision = None - #revision = "sharded" - - # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with - # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" - if is_offline_mode(): - print("Offline mode: forcing local_files_only=True") - local_files_only = True - else: - local_files_only = False - - filename = WEIGHTS_NAME - archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) - - try: - resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) - return [resolved_archive_file] - - except (EntryNotFoundError, FileNotFoundError): - if filename == WEIGHTS_NAME: - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - archive_file = hf_bucket_url( - pretrained_model_name_or_path, - filename=WEIGHTS_INDEX_NAME, - revision=revision, - ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - local_files_only=local_files_only, - ) - is_sharded = True - - if is_sharded: - # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path, - resolved_archive_file, - cache_dir=cache_dir, - revision=revision - ) - - return resolved_archive_file - -model_name = args.name - -#print(get_checkpoint_files(model_name)) - -if rank == 0: - print(f"*** Loading the model {model_name}") - -tokenizer = AutoTokenizer.from_pretrained(model_name) -config = AutoConfig.from_pretrained(model_name) - -# XXX: can't automatically derive dtype via config's `from_pretrained` -#dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 - - -# use one of these args to `init_inference` -# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work -# 2. replace_with_kernel_inject is the faster one (fast fused kernels) -kernel_inject = True -#kernel_inject = False - -if kernel_inject: - # XXX: for now ds-inference only works with fp16 - dtype = torch.float16 -else: - dtype = torch.bfloat16 - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) - -# Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load -with deepspeed.OnDevice(dtype=dtype, device='meta'): - model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) - -if args.benchmark: - deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) - -model = model.eval() - - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) - -### Deepspeed-Inference Loading - -checkpoints_json = "checkpoints.json" -def write_checkponts_json(): - - with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - - #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" - #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") - checkpoint_files = get_checkpoint_files(model_name) - - #print("Checkpoint files:", checkpoint_files) - - data = { - "type": "BLOOM-176B", - "checkpoints": checkpoint_files, - "version": 1.0 - } - json.dump(data, f) - -if rank == 0: - write_checkponts_json() -dist.barrier() - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) - -if kernel_inject: - kwargs = dict(replace_with_kernel_inject=True) -else: - kwargs = dict(injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}) - -#checkpoints_json=None -model = deepspeed.init_inference(model, - mp_size=world_size, - dtype=torch.half, - checkpoint=checkpoints_json, - **kwargs, - ) - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) - - -model = model.module - -if args.benchmark: - t_ready = time.time() - - -### Generate - -if rank == 0: - print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") - -input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way" -] - -if args.batch_size > len(input_sentences): - # dynamically extend to support larger bs by repetition - input_sentences *= math.ceil(args.batch_size / len(input_sentences)) - -generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) - -if rank == 0: - print(f"Generate args {generate_kwargs}") -inputs = input_sentences[:args.batch_size] -def generate(): - """ returns a list of zipped inputs, outputs and number of new tokens """ - - input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) - - outputs = model.generate(**input_tokens, **generate_kwargs) - - input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_tokens_lengths = [x.shape[0] for x in outputs] - - total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - return zip(inputs, outputs, total_new_tokens) - - -# warmup is a must if measuring speed as it's when all the optimizations are performed -# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs -_ = generate() - -t_generate_start = time.time() -generated = generate() -t_generate_span = time.time() - t_generate_start -if rank == 0: - for i,o,_ in generated: - print(f"{'-'*60}\nin={i}\nout={o}\n") - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) - -### Benchmark - -# benchmark it! -if args.benchmark: - if rank == 0: - print(f"*** Running benchmark") - - # warm up - for i in range(1): - _ = generate() - torch.cuda.synchronize() - - # benchmark - t0 = time.time() - cycles = 5 - total_new_tokens_generated = 0 - for i in range(cycles): - generated = generate() - total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) - torch.cuda.synchronize() - if rank == 0: - througput = (time.time() - t0)/(total_new_tokens_generated) - print(f""" -*** Performance stats: -Throughput per token including tokenize: {througput*1000:.2f} msecs -Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs -Start to finish: {t_ready - t_start + t_generate_span:.3f} secs -""") diff --git a/scripts/inference/bloom_ds_inference.py b/scripts/inference/bloom_ds_inference.py new file mode 100644 index 000000000..a5671fec6 --- /dev/null +++ b/scripts/inference/bloom_ds_inference.py @@ -0,0 +1,237 @@ +import io +import json +import os +import shutil +from argparse import Namespace +from typing import List, Union + +import deepspeed +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.modeling_utils import get_checkpoint_shard_files +from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_offline_mode +from transformers.utils.hub import EntryNotFoundError + +import utils +from utils import Model, benchmark_end_to_end, get_argument_parser, print_rank_n, run_rank_n + + +class DSInferenceModel(Model): + def __init__(self, args: Namespace) -> None: + print_rank_n("Loading model...") + + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + use_hf_checkpoints = not args.save_mp_checkpoint_path + cache_ds_checkpoints = args.save_mp_checkpoint_path and not os.path.isdir( + args.save_mp_checkpoint_path) + + world_size = int(os.getenv("WORLD_SIZE", "1")) + + # Load model + with deepspeed.OnDevice(dtype=args.dtype, device="meta"): + self.model = AutoModelForCausalLM.from_config( + 
AutoConfig.from_pretrained(args.model_name), + torch_dtype=torch.bfloat16 + ) + self.model = self.model.eval() + + # Write checkpoints.json + tmp_directory = "tmp" + run_rank_n( + os.makedirs, + { + "name": tmp_directory, + "exist_ok": True + } + ) + checkpoints_json = os.path.join(tmp_directory, "checkpoints.json") + run_rank_n( + write_checkponts_json, + { + "checkpoints_json": checkpoints_json, + "model_name": args.model_name + }, + barrier=True + ) + + # init inference + if (use_hf_checkpoints): + print_rank_n("Loading HF checkpoints") + + if (args.dtype == torch.float16): + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) + else: + raise NotImplementedError("bfloat16 is not yet supported") + elif (cache_ds_checkpoints): + print_rank_n("Caching DS checkpoints and loading model") + + run_rank_n( + os.makedirs, + { + "name": args.save_mp_checkpoint_path, + "exist_ok": True + }, + barrier=True + ) + + if (args.dtype == torch.float16): + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True, + save_mp_checkpoint_path=args.save_mp_checkpoint_path + ) + else: + raise NotImplementedError("bfloat16 is not yet supported") + else: + print_rank_n("Loading DS cached checkpoints") + + checkpoints_json = os.path.join( + args.save_mp_checkpoint_path, "BLOOM-176B_ds-inference_config.json") + + if (args.dtype == torch.float16): + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) + else: + raise NotImplementedError("bfloat16 is not yet supported") + + run_rank_n(shutil.rmtree, {"path": tmp_directory}) + + self.model = self.model.module + self.input_device = torch.cuda.current_device() + + print_rank_n("Model loaded") + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[str, List[str]]: + if (type(text) == str): + text = [text] + + input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) + + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(self.input_device) + + with torch.no_grad(): + output_tokens = self.model.generate( + **input_tokens, + **generate_kwargs + ) + + input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_token_lengths = [x.shape[0] for x in output_tokens] + generated_tokens = [ + o - i for i, o in zip(input_token_lengths, output_token_lengths)] + + if (remove_input_from_output): + output_tokens = [x[-i:] + for x, i in zip(output_tokens, generated_tokens)] + + output_text = self.tokenizer.batch_decode( + output_tokens, skip_special_tokens=True) + + return output_text, generated_tokens + + +def get_args(): + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument("--benchmark_cycles", type=int, + default=0, help="additionally run benchmark") + group.add_argument("--local_rank", required=False, + type=int, help="used by dist launchers") + group.add_argument("--save_mp_checkpoint_path", required=False, + type=str, help="MP checkpoints path") + + args = utils.get_args(parser) + + return args + + +def get_checkpoint_files(pretrained_model_name_or_path): + # XXX: I just hacked this one together to automatically handle the fetching of the model file 
or + # shards into cache and returning the cached entries - note that I removed most arguments + cache_dir = None + is_sharded = False + + # XXX: preparation for revision branches if needed + revision = None + #revision = "sharded" + + # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with + # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" + if (is_offline_mode()): + print("Offline mode: forcing local_files_only=True") + local_files_only = True + else: + local_files_only = False + + filename = WEIGHTS_NAME + archive_file = hf_bucket_url( + pretrained_model_name_or_path, filename=filename, revision=revision) + + try: + resolved_archive_file = cached_path( + archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) + return [resolved_archive_file] + except (EntryNotFoundError, FileNotFoundError): + if filename == WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=WEIGHTS_INDEX_NAME, + revision=revision, + ) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + local_files_only=local_files_only, + ) + is_sharded = True + + if (is_sharded): + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + revision=revision + ) + + return resolved_archive_file + + +def write_checkponts_json(checkpoints_json: str, model_name: str) -> None: + with io.open(checkpoints_json, 'w', encoding='utf-8') as f: + #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" + #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + checkpoint_files = get_checkpoint_files(model_name) + data = { + "type": "BLOOM-176B", + "checkpoints": checkpoint_files, + "version": 1.0 + } + json.dump(data, f) + + +if (__name__ == "__main__"): + deepspeed.init_distributed('nccl') + benchmark_end_to_end(get_args(), DSInferenceModel) From 25d0c7042f0daa7f74f23bf12adfd6f444429941 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 14 Aug 2022 03:54:55 +0530 Subject: [PATCH 05/32] refactor DS ZeRO --- scripts/inference/bloom-ds-zero-inference.py | 211 ------------------- scripts/inference/bloom_ds_zero_inference.py | 162 ++++++++++++++ scripts/inference/utils.py | 14 +- 3 files changed, 174 insertions(+), 213 deletions(-) delete mode 100644 scripts/inference/bloom-ds-zero-inference.py create mode 100644 scripts/inference/bloom_ds_zero_inference.py diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py deleted file mode 100644 index 043b4967f..000000000 --- a/scripts/inference/bloom-ds-zero-inference.py +++ /dev/null @@ -1,211 +0,0 @@ -# usage: -# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom -# -# to run benchmarks: -# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark -# - - -# This is going to improve, but at the moment, the process is a bit cumbersome - we first use -# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, -# 2. free the allocated storage -# 3. start Deepspeed-Inference and only now load the checkpoint -# 4. run generate -# Done. 
-# - - -from argparse import ArgumentParser -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -from transformers.deepspeed import HfDeepSpeedConfig -from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock -import deepspeed -import gc -import glob -import io -import json -import math -import os -import sys -import time -import torch -import torch.distributed as dist - -t_start = time.time() - -num_tokens = 100 - -parser = ArgumentParser() - -parser.add_argument("--name", required=True, type=str, help="model_name") -parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") -parser.add_argument("--batch_size", default=1, type=int, help="batch size") -parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") -parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") -args = parser.parse_args() - -local_rank = int(os.getenv('LOCAL_RANK', '0')) -world_size = int(os.getenv('WORLD_SIZE', '1')) - - -### Model loading and instantiating on GPU (via ZeRO) - -model_name = args.name - -if local_rank == 0: - print(f"*** Loading the model {model_name}") - -tokenizer = AutoTokenizer.from_pretrained(model_name) -config = AutoConfig.from_pretrained(model_name) - -# XXX: can't automatically derive dtype via config's `from_pretrained` -dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 - -model_hidden_size = config.hidden_size -train_batch_size = 1 * world_size - -ds_config = { - "fp16": { - "enabled": dtype == torch.float16, - }, - "bf16": { - "enabled": dtype == torch.bfloat16, - }, - "zero_optimization": { - "stage": 3, - "overlap_comm": True, - "contiguous_gradients": True, - "reduce_bucket_size": model_hidden_size * model_hidden_size, - "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, - "stage3_param_persistence_threshold": 0 - }, - "steps_per_print": 2000, - "train_batch_size": train_batch_size, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": False -} - -if args.cpu_offload: - ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) - -dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) - -model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) - -if args.benchmark: - deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) - -model = model.eval() - -rank = dist.get_rank() - -if rank == 0: - print(ds_config) - -ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] -ds_engine.module.eval() -model = ds_engine.module - -if args.benchmark: - t_ready = time.time() - - -### Generate - -if rank == 0: - print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") - -input_sentences = [ - "DeepSpeed is a machine learning framework", - "He is working on", - "He has a", - "He got all", - "Everyone is happy and I can", - "The new movie that got Oscar this year", - "In the far far distance from our galaxy,", - "Peace is the only way" -] - -if args.batch_size > len(input_sentences): - # dynamically extend to support larger bs by repetition - input_sentences *= math.ceil(args.batch_size / len(input_sentences)) - -generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) - -if rank == 0: - print(f"Generate args {generate_kwargs}") -inputs = input_sentences[:args.batch_size] -def generate(): - """ returns a list of zipped inputs, outputs and number of new tokens """ - - input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) - - outputs = model.generate(**input_tokens, **generate_kwargs) - - input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_tokens_lengths = [x.shape[0] for x in outputs] - - total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] - outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) - - return zip(inputs, outputs, total_new_tokens) - -# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size - -# warmup is a must if measuring speed as it's when all the optimizations are performed -# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs -_ = generate() - -t_generate_start = time.time() -pairs = generate() -t_generate_span = time.time() - t_generate_start -if rank == 0: - for i,o,_ in pairs: - print(f"{'-'*60}\nin={i}\nout={o}\n") - - -if args.benchmark: - torch.cuda.empty_cache() - gc.collect() - deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) - -### Benchmark - -if args.benchmark: - if rank == 0: - print(f"*** Running benchmark") - - # warm up - for i in range(1): - _ = generate() - torch.cuda.synchronize() - - # benchmark - t0 = time.time() - cycles = 5 - total_new_tokens_generated = 0 - for i in range(cycles): - generated = generate() - total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) - - torch.cuda.synchronize() - if rank == 0: - # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) - total_new_tokens_generated *= world_size - througput = (time.time() - t0)/(total_new_tokens_generated) - print(f""" -*** Performance stats: -Throughput per token including tokenize: {througput*1000:.2f} msecs -Start to ready to generate: {t_ready - t_start:.3f} secs -Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs -Start to finish: {t_ready - t_start + t_generate_span:.3f} secs -""") - diff --git a/scripts/inference/bloom_ds_zero_inference.py b/scripts/inference/bloom_ds_zero_inference.py new file mode 100644 index 000000000..38e8e7a89 --- /dev/null +++ b/scripts/inference/bloom_ds_zero_inference.py @@ -0,0 +1,162 @@ +import os +from argparse import Namespace +from typing import List, Union + +import deepspeed +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.deepspeed import HfDeepSpeedConfig + +import utils +from utils import Model, benchmark_end_to_end, get_argument_parser, print_rank_n + + +class DSZeroModel(Model): + def __init__(self, args: Namespace) -> None: + if (args.local_rank == 0): + print("Loading model...") + + config = AutoConfig.from_pretrained(args.model_name) + + world_size = int(os.getenv('WORLD_SIZE', '1')) + train_batch_size = 1 * world_size + + ds_config = { + "fp16": { + "enabled": args.dtype == torch.float16, + }, + "bf16": { + "enabled": args.dtype == torch.bfloat16, + }, + "zero_optimization": { + "stage": 3, 
+ "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": config.hidden_size * config.hidden_size, + "stage3_prefetch_bucket_size": 0.9 * config.hidden_size * config.hidden_size, + "stage3_param_persistence_threshold": 0 + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False + } + + if (args.cpu_offload): + ds_config["zero_optimization"]["offload_param"] = { + "device": "cpu", + "pin_memory": True + } + + # this tells from_pretrained to instantiate directly on gpus + dschf = HfDeepSpeedConfig(ds_config) + + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + + self.model = AutoModelForCausalLM.from_pretrained( + args.model_name, torch_dtype=args.dtype) + self.model = self.model.eval() + self.model = deepspeed.initialize( + model=self.model, config_params=ds_config)[0] + self.model.module.eval() + self.model = self.model.module + + self.input_device = torch.cuda.current_device() + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[str, List[str]]: + if (type(text) == str): + text = [text] + + input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) + + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(self.input_device) + + with torch.no_grad(): + output_tokens = self.model.generate( + **input_tokens, + **generate_kwargs + ) + + input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_token_lengths = [x.shape[0] for x in output_tokens] + generated_tokens = [ + o - i for i, o in zip(input_token_lengths, output_token_lengths)] + + if (remove_input_from_output): + output_tokens = [x[-i:] + for x, i in zip(output_tokens, generated_tokens)] + + output_text = self.tokenizer.batch_decode( + output_tokens, skip_special_tokens=True) + + return output_text, generated_tokens + + +def get_args(): + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument("--benchmark_cycles", type=int, + default=0, help="additionally run benchmark") + group.add_argument("--local_rank", required=False, + type=int, help="used by dist launchers") + group.add_argument("--cpu_offload", action="store_true", + help="whether to activate CPU offload") + + args = utils.get_args(parser) + + return args + + +def get_max_memory_per_gpu_dict(dtype, model_name): + """ try to generate the memory map based on what we know about the model and the available hardware """ + + # figure out the memory map - the minimum per gpu required to load the model + n_gpus = torch.cuda.device_count() + + if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: + # hand crafted optimized memory map for 8x80 setup over BLOOM + # this works with bs=40 + return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + + try: + # model_params calculation, as we don't have a model yet to do: + #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + + config = AutoConfig.from_pretrained(model_name) + h = config.n_embed + l = config.n_layer + v = config.vocab_size + # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing + model_params = l*(12*h**2 + 13*h) + v*h + 4*h + except: + print_rank_n( + f"The model {model_name} has a 
broken config file. Please notify the owner") + raise + + bytes = torch.finfo(dtype).bits / 8 + param_memory_total_in_bytes = model_params * bytes + # add 5% since weight sizes aren't the same and some GPU may need more memory + param_memory_per_gpu_in_bytes = int( + param_memory_total_in_bytes / n_gpus * 1.05) + print_rank_n( + f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") + + # check the real available memory + # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) + torch.ones(1).cuda() + max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] + if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: + raise ValueError( + f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") + + return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} + + +if (__name__ == "__main__"): + benchmark_end_to_end(get_args(), DSZeroModel, zero_activated=True) diff --git a/scripts/inference/utils.py b/scripts/inference/utils.py index 777f1a3ba..0ddc4497d 100644 --- a/scripts/inference/utils.py +++ b/scripts/inference/utils.py @@ -2,6 +2,7 @@ import copy import gc import math +import os import time from typing import Any, List, Union @@ -160,7 +161,9 @@ def get_benchmark_results(benchmark_time: float, """ -def benchmark_end_to_end(args: argparse.Namespace, model_class: Model) -> None: +def benchmark_end_to_end(args: argparse.Namespace, + model_class: Model, + zero_activated: bool = False) -> None: model, initialization_time = run_and_log_time( Execute(model_class, {"args": args}) ) @@ -216,10 +219,17 @@ def benchmark_end_to_end(args: argparse.Namespace, model_class: Model) -> None: { "input_sentences": input_sentences, "model": model, - "generate_kwargs": generate_kwargs + "generate_kwargs": generate_kwargs, + "cycles": args.benchmark_cycles } ) ) + + # with ZeRO every GPU is generating batch_size * sequence_length tokens + if (zero_activated): + world_size = int(os.getenv('WORLD_SIZE', '1')) + total_new_tokens_generated *= world_size + print_rank_n( get_benchmark_results( benchmark_time, From 7be1410838fd1a9cbdf084f3279622b9b71daffc Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 14 Aug 2022 04:48:59 +0530 Subject: [PATCH 06/32] make inference library --- {scripts/inference => inference}/README.md | 0 inference/benchmark.py | 60 +++++++++++++++++++ inference/ds_inference/__init__.py | 1 + .../ds_inference/model.py | 24 +------- inference/ds_zero/__init__.py | 1 + .../ds_zero/model.py | 25 +------- inference/hf_accelerate/__init__.py | 1 + .../hf_accelerate/model.py | 24 +------- inference/utils/__init__.py | 1 + .../inference => inference/utils}/utils.py | 0 inference/values.py | 3 + 11 files changed, 71 insertions(+), 69 deletions(-) rename {scripts/inference => inference}/README.md (100%) create mode 100644 inference/benchmark.py create mode 100644 inference/ds_inference/__init__.py rename scripts/inference/bloom_ds_inference.py => inference/ds_inference/model.py (90%) create mode 100644 inference/ds_zero/__init__.py rename scripts/inference/bloom_ds_zero_inference.py => inference/ds_zero/model.py (87%) create mode 100644 inference/hf_accelerate/__init__.py rename scripts/inference/bloom_accelerate_inference.py => inference/hf_accelerate/model.py (89%) create mode 100644 inference/utils/__init__.py rename 
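The parameter formula in get_max_memory_per_gpu_dict is easy to sanity-check by hand. The config values below (70 layers, hidden size 14336, vocabulary 250880) are the commonly quoted BLOOM-176B numbers and are hard-coded assumptions in this worked example rather than values read from the hub.

# same sizing formula as above, applied to assumed BLOOM-176B config values
n_layers, hidden, vocab = 70, 14336, 250880
n_gpus, bytes_per_param = 8, 2  # fp16 weights over an 8-GPU node

model_params = n_layers * (12 * hidden**2 + 13 * hidden) + vocab * hidden + 4 * hidden
param_bytes = model_params * bytes_per_param
per_gpu = int(param_bytes / n_gpus * 1.05)  # same 5% head-room as the script

print(f"params           ~ {model_params / 1e9:.1f}B")    # ~176.2B
print(f"weights total    ~ {param_bytes / 2**30:.1f} GiB") # ~328 GiB
print(f"estimate per GPU ~ {per_gpu / 2**30:.1f} GiB")     # ~43 GiB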
{scripts/inference => inference/utils}/utils.py (100%) create mode 100644 inference/values.py diff --git a/scripts/inference/README.md b/inference/README.md similarity index 100% rename from scripts/inference/README.md rename to inference/README.md diff --git a/inference/benchmark.py b/inference/benchmark.py new file mode 100644 index 000000000..ece6329b1 --- /dev/null +++ b/inference/benchmark.py @@ -0,0 +1,60 @@ +import deepspeed + +import utils +import values +from ds_inference import DSInferenceModel +from ds_zero import DSZeROModel +from hf_accelerate import HFAccelerateModel +from utils import benchmark_end_to_end, get_argument_parser + + +def get_args(): + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument( + "--deployment_framework", + type=str, + choices=[ + values.HF_ACCELERATE, + values.DS_INFERENCE, + values.DS_ZERO + ], + default=values.HF_ACCELERATE + ) + group.add_argument("--benchmark_cycles", type=int, + default=0, help="additionally run benchmark") + group.add_argument("--local_rank", required=False, + type=int, help="used by dist launchers") + group.add_argument("--save_mp_checkpoint_path", required=False, + type=str, help="MP checkpoints path for DS inference") + group.add_argument("--cpu_offload", action="store_true", + help="whether to activate CPU offload for DS ZeRO") + + args = utils.get_args(parser) + + launched_with_deepspeed = args.deployment_framework in [ + values.DS_INFERENCE, values.DS_ZERO] + + if (not launched_with_deepspeed): + assert args.local_rank == None, "local_rank must be None if not launched with DeepSpeed" + + if (args.save_mp_checkpoint_path): + assert args.deployment_framework == values.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" + + if (args.cpu_offload): + assert args.deployment_framework == values.DS_ZERO, "cpu_offload only works with DS_ZeRO" + + return args + + +if (__name__ == "__main__"): + args = get_args() + + if (args.deployment_framework == values.HF_ACCELERATE): + benchmark_end_to_end(get_args(), HFAccelerateModel) + elif (args.deployment_framework == values.DS_INFERENCE): + deepspeed.init_distributed('nccl') + benchmark_end_to_end(get_args(), DSInferenceModel) + else: + benchmark_end_to_end(get_args(), DSZeROModel, zero_activated=True) diff --git a/inference/ds_inference/__init__.py b/inference/ds_inference/__init__.py new file mode 100644 index 000000000..0f1181842 --- /dev/null +++ b/inference/ds_inference/__init__.py @@ -0,0 +1 @@ +from .model import DSInferenceModel diff --git a/scripts/inference/bloom_ds_inference.py b/inference/ds_inference/model.py similarity index 90% rename from scripts/inference/bloom_ds_inference.py rename to inference/ds_inference/model.py index a5671fec6..af5f86607 100644 --- a/scripts/inference/bloom_ds_inference.py +++ b/inference/ds_inference/model.py @@ -12,8 +12,7 @@ from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_offline_mode from transformers.utils.hub import EntryNotFoundError -import utils -from utils import Model, benchmark_end_to_end, get_argument_parser, print_rank_n, run_rank_n +from utils import Model, print_rank_n, run_rank_n class DSInferenceModel(Model): @@ -150,22 +149,6 @@ def generate(self, return output_text, generated_tokens -def get_args(): - parser = get_argument_parser() - - group = parser.add_argument_group(title="launch config") - group.add_argument("--benchmark_cycles", type=int, - default=0, help="additionally run benchmark") - 
group.add_argument("--local_rank", required=False, - type=int, help="used by dist launchers") - group.add_argument("--save_mp_checkpoint_path", required=False, - type=str, help="MP checkpoints path") - - args = utils.get_args(parser) - - return args - - def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or # shards into cache and returning the cached entries - note that I removed most arguments @@ -230,8 +213,3 @@ def write_checkponts_json(checkpoints_json: str, model_name: str) -> None: "version": 1.0 } json.dump(data, f) - - -if (__name__ == "__main__"): - deepspeed.init_distributed('nccl') - benchmark_end_to_end(get_args(), DSInferenceModel) diff --git a/inference/ds_zero/__init__.py b/inference/ds_zero/__init__.py new file mode 100644 index 000000000..846c5618c --- /dev/null +++ b/inference/ds_zero/__init__.py @@ -0,0 +1 @@ +from .model import DSZeROModel diff --git a/scripts/inference/bloom_ds_zero_inference.py b/inference/ds_zero/model.py similarity index 87% rename from scripts/inference/bloom_ds_zero_inference.py rename to inference/ds_zero/model.py index 38e8e7a89..7b9a2b365 100644 --- a/scripts/inference/bloom_ds_zero_inference.py +++ b/inference/ds_zero/model.py @@ -7,11 +7,10 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.deepspeed import HfDeepSpeedConfig -import utils -from utils import Model, benchmark_end_to_end, get_argument_parser, print_rank_n +from utils import Model, print_rank_n -class DSZeroModel(Model): +class DSZeROModel(Model): def __init__(self, args: Namespace) -> None: if (args.local_rank == 0): print("Loading model...") @@ -97,22 +96,6 @@ def generate(self, return output_text, generated_tokens -def get_args(): - parser = get_argument_parser() - - group = parser.add_argument_group(title="launch config") - group.add_argument("--benchmark_cycles", type=int, - default=0, help="additionally run benchmark") - group.add_argument("--local_rank", required=False, - type=int, help="used by dist launchers") - group.add_argument("--cpu_offload", action="store_true", - help="whether to activate CPU offload") - - args = utils.get_args(parser) - - return args - - def get_max_memory_per_gpu_dict(dtype, model_name): """ try to generate the memory map based on what we know about the model and the available hardware """ @@ -156,7 +139,3 @@ def get_max_memory_per_gpu_dict(dtype, model_name): f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} - - -if (__name__ == "__main__"): - benchmark_end_to_end(get_args(), DSZeroModel, zero_activated=True) diff --git a/inference/hf_accelerate/__init__.py b/inference/hf_accelerate/__init__.py new file mode 100644 index 000000000..71a0b52d7 --- /dev/null +++ b/inference/hf_accelerate/__init__.py @@ -0,0 +1 @@ +from .model import HFAccelerateModel diff --git a/scripts/inference/bloom_accelerate_inference.py b/inference/hf_accelerate/model.py similarity index 89% rename from scripts/inference/bloom_accelerate_inference.py rename to inference/hf_accelerate/model.py index 8fc5a6120..5d0a8c7f2 100644 --- a/scripts/inference/bloom_accelerate_inference.py +++ b/inference/hf_accelerate/model.py @@ -4,13 +4,7 @@ import torch from transformers import 
AutoConfig, AutoModelForCausalLM, AutoTokenizer -import utils -from utils import ( - Model, - benchmark_end_to_end, - get_argument_parser, - print_rank_n -) +from utils import Model, print_rank_n class HFAccelerateModel(Model): @@ -66,18 +60,6 @@ def generate(self, return output_text, generated_tokens -def get_args(): - parser = get_argument_parser() - - group = parser.add_argument_group(title="launch config") - group.add_argument("--benchmark_cycles", type=int, - default=0, help="additionally run benchmark") - - args = utils.get_args(parser) - - return args - - def get_max_memory_per_gpu_dict(dtype, model_name): """ try to generate the memory map based on what we know about the model and the available hardware """ @@ -121,7 +103,3 @@ def get_max_memory_per_gpu_dict(dtype, model_name): f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} - - -if (__name__ == "__main__"): - benchmark_end_to_end(get_args(), HFAccelerateModel) diff --git a/inference/utils/__init__.py b/inference/utils/__init__.py new file mode 100644 index 000000000..58f0a85d2 --- /dev/null +++ b/inference/utils/__init__.py @@ -0,0 +1 @@ +from .utils import MaxTokensError, Model, benchmark_end_to_end, get_args, get_argument_parser, print_rank_n, run_rank_n diff --git a/scripts/inference/utils.py b/inference/utils/utils.py similarity index 100% rename from scripts/inference/utils.py rename to inference/utils/utils.py diff --git a/inference/values.py b/inference/values.py new file mode 100644 index 000000000..8c16f280c --- /dev/null +++ b/inference/values.py @@ -0,0 +1,3 @@ +HF_ACCELERATE = "hf_accelerate" +DS_INFERENCE = "ds_inference" +DS_ZERO = "ds_zero" From 5c31d9a3a9577ea6098b7ad5d1cbc1e4f647e26e Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Mon, 15 Aug 2022 06:09:09 +0530 Subject: [PATCH 07/32] cli --- inference/benchmark.py | 169 +++++++++++++++-- inference/cache_ds_checkpoints.py | 27 +++ inference/cli.py | 79 ++++++++ inference/{values.py => constants.py} | 0 inference/ds_inference/__init__.py | 2 + inference/ds_inference/cache.py | 67 +++++++ inference/ds_inference/grpc_server.py | 83 +++++++++ inference/ds_inference/model.py | 102 ++-------- inference/ds_zero/model.py | 81 +------- inference/hf_accelerate/model.py | 34 ---- inference/utils/__init__.py | 13 +- inference/utils/model.py | 54 ++++++ inference/utils/utils.py | 259 +++++++++++--------------- 13 files changed, 601 insertions(+), 369 deletions(-) create mode 100644 inference/cache_ds_checkpoints.py create mode 100644 inference/cli.py rename inference/{values.py => constants.py} (100%) create mode 100644 inference/ds_inference/cache.py create mode 100644 inference/ds_inference/grpc_server.py create mode 100644 inference/utils/model.py diff --git a/inference/benchmark.py b/inference/benchmark.py index ece6329b1..44478ef71 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -1,14 +1,142 @@ +import argparse +import gc +import os +import time +from typing import Any, List, Union + import deepspeed +import torch +import constants import utils -import values from ds_inference import DSInferenceModel from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel -from utils import benchmark_end_to_end, get_argument_parser +from utils import Execute, Model, get_argument_parser, 
get_dummy_batch, print_rank_n + + +def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: + """ + runs a list of Execute objects and returns a list of outputs and the time taken + """ + start_time = time.time() + + if (type(execs) == list): + results = [] + for e in execs: + results.append(e()) + else: + results = execs() + + time_elapsed = time.time() - start_time + return results, time_elapsed + + +def benchmark_generation(input_sentences, + model, + generate_kwargs, + cycles: int = 5): + total_new_tokens_generated = 0 + for _ in range(cycles): + _, num_generated_tokens = model.generate( + input_sentences, + generate_kwargs + ) + total_new_tokens_generated += sum( + new_tokens for new_tokens in num_generated_tokens) + return total_new_tokens_generated + + +def get_benchmark_results(benchmark_time: float, + initialization_time: float, + generation_time: float, + total_new_tokens_generated: int, + batch_size: int) -> str: + throughput = total_new_tokens_generated / benchmark_time + return f""" +*** Performance stats: +Throughput (including tokenization) = {throughput:.2f} tokens/sec +Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token +Model loading time = {initialization_time:.2f} secs +Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} +Generation time per batch = {generation_time:.2f} secs +Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs +""" + + +def benchmark_end_to_end(args: argparse.Namespace, + model_class: Model, + zero_activated: bool = False) -> None: + model, initialization_time = run_and_log_time( + Execute(model_class, {"args": args}) + ) + + print_rank_n( + f"*** Starting to generate {args.generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") + + input_sentences = get_dummy_batch(args.batch_size) + print_rank_n(f"Generate args {args.generate_kwargs}") -def get_args(): + # warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. 
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + model.generate( + input_sentences, + args.generate_kwargs + ) + + (output_text, num_generated_tokens), generation_time = run_and_log_time( + Execute( + model.generate, + { + "text": input_sentences, + "generate_kwargs": args.generate_kwargs + } + ) + ) + for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): + print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") + + if (args.benchmark_cycles > 0): + print_rank_n(f"*** Running benchmark") + + torch.cuda.empty_cache() + gc.collect() + + # warm up + model.generate(input_sentences, args.generate_kwargs) + torch.cuda.synchronize() + + # benchmark + total_new_tokens_generated, benchmark_time = run_and_log_time( + Execute( + benchmark_generation, + { + "input_sentences": input_sentences, + "model": model, + "generate_kwargs": args.generate_kwargs, + "cycles": args.benchmark_cycles + } + ) + ) + + # with ZeRO every GPU is generating batch_size * sequence_length tokens + if (zero_activated): + world_size = int(os.getenv('WORLD_SIZE', '1')) + total_new_tokens_generated *= world_size + + print_rank_n( + get_benchmark_results( + benchmark_time, + initialization_time, + generation_time, + total_new_tokens_generated, + args.batch_size + ) + ) + + +def get_args() -> argparse.Namespace: parser = get_argument_parser() group = parser.add_argument_group(title="launch config") @@ -16,11 +144,11 @@ def get_args(): "--deployment_framework", type=str, choices=[ - values.HF_ACCELERATE, - values.DS_INFERENCE, - values.DS_ZERO + constants.HF_ACCELERATE, + constants.DS_INFERENCE, + constants.DS_ZERO ], - default=values.HF_ACCELERATE + default=constants.HF_ACCELERATE ) group.add_argument("--benchmark_cycles", type=int, default=0, help="additionally run benchmark") @@ -34,27 +162,34 @@ def get_args(): args = utils.get_args(parser) launched_with_deepspeed = args.deployment_framework in [ - values.DS_INFERENCE, values.DS_ZERO] + constants.DS_INFERENCE, constants.DS_ZERO] if (not launched_with_deepspeed): assert args.local_rank == None, "local_rank must be None if not launched with DeepSpeed" if (args.save_mp_checkpoint_path): - assert args.deployment_framework == values.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" + assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" if (args.cpu_offload): - assert args.deployment_framework == values.DS_ZERO, "cpu_offload only works with DS_ZeRO" + assert args.deployment_framework == constants.DS_ZERO, "cpu_offload only works with DS_ZeRO" return args -if (__name__ == "__main__"): +def main() -> None: args = get_args() - if (args.deployment_framework == values.HF_ACCELERATE): - benchmark_end_to_end(get_args(), HFAccelerateModel) - elif (args.deployment_framework == values.DS_INFERENCE): - deepspeed.init_distributed('nccl') - benchmark_end_to_end(get_args(), DSInferenceModel) + if (args.deployment_framework == constants.HF_ACCELERATE): + benchmark_end_to_end(args, HFAccelerateModel) + elif (args.deployment_framework == constants.DS_INFERENCE): + deepspeed.init_distributed("nccl") + benchmark_end_to_end(args, DSInferenceModel) + elif (args.deployment_framework == constants.DS_ZERO): + benchmark_end_to_end(args, DSZeROModel, zero_activated=True) else: - benchmark_end_to_end(get_args(), DSZeROModel, zero_activated=True) + raise ValueError( + f"Unknown deployment framework {args.deployment_framework}") + + +if (__name__ == "__main__"): + main() diff 
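Execute and run_and_log_time are small enough to exercise in isolation. The toy below mirrors the deferred-call timing pattern used throughout benchmark_end_to_end with nothing beyond the standard library, so it can be tried without loading any model; the names copy the patch, but the block is self-contained and illustrative only.

import time
from typing import Any, Callable, List, Tuple, Union


class Execute:
    def __init__(self, func: Callable, kwargs: dict) -> None:
        self.func = func
        self.kwargs = kwargs

    def __call__(self) -> Any:
        return self.func(**self.kwargs)


def run_and_log_time(execs: Union[List["Execute"], "Execute"]) -> Tuple[Any, float]:
    # run one deferred call or a list of them, returning results plus wall time
    start = time.time()
    results = [e() for e in execs] if isinstance(execs, list) else execs()
    return results, time.time() - start


def wait(seconds: float) -> None:
    time.sleep(seconds)


(_, squares), elapsed = run_and_log_time([
    Execute(wait, {"seconds": 0.1}),
    Execute(lambda n: [i * i for i in range(n)], {"n": 5}),
])
print(squares, f"took {elapsed:.2f}s")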
--git a/inference/cache_ds_checkpoints.py b/inference/cache_ds_checkpoints.py new file mode 100644 index 000000000..f3a75c3dd --- /dev/null +++ b/inference/cache_ds_checkpoints.py @@ -0,0 +1,27 @@ +import argparse + +import utils +from ds_inference import cache_ds_checkpoints +from utils import get_argument_parser + + +def get_args() -> argparse.Namespace: + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument("--local_rank", required=False, + type=int, help="used by dist launchers") + group.add_argument("--save_mp_checkpoint_path", required=True, + type=str, help="MP checkpoints path for DS inference") + + args = utils.get_args(parser) + + return args + + +def main() -> None: + cache_ds_checkpoints(get_args()) + + +if (__name__ == "__main__"): + main() diff --git a/inference/cli.py b/inference/cli.py new file mode 100644 index 000000000..1df691841 --- /dev/null +++ b/inference/cli.py @@ -0,0 +1,79 @@ +import argparse +import json + +import deepspeed + +import constants +import utils +from ds_inference import DSInferenceGRPCServer +from hf_accelerate import HFAccelerateModel +from utils import get_argument_parser, parse_generate_kwargs, print_rank_n + + +def get_args() -> argparse.Namespace: + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument( + "--deployment_framework", + type=str, + choices=[ + constants.HF_ACCELERATE, + constants.DS_INFERENCE + ], + default=constants.HF_ACCELERATE + ) + group.add_argument("--save_mp_checkpoint_path", required=False, + type=str, help="MP checkpoints path for DS inference") + group.add_argument("--shutdown_command", required=False, + type=str, default="__shutdown__", help="This string will exit the script") + + args = utils.get_args(parser) + + if (args.save_mp_checkpoint_path): + assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" + + return args + + +def main() -> None: + args = get_args() + + if (args.deployment_framework == constants.HF_ACCELERATE): + model = HFAccelerateModel(args) + elif (args.deployment_framework == constants.DS_INFERENCE): + model = DSInferenceGRPCServer(args) + else: + raise ValueError( + f"Unknown deployment framework {args.deployment_framework}") + + generate_kwargs = args.generate_kwargs + + while (True): + # currently only 1 process is running so its + # fine but might need to run_rank_n for this + # if running a deployment_framework with + # multiple processes + input_text = input("Input text: ") + + if (input_text == args.shutdown_command): + model.shutdown() + + if (input("change generate_kwargs? 
[y/n] ") == "y"): + generate_kwargs = input("Generate kwargs: ") + generate_kwargs = json.loads(generate_kwargs) + generate_kwargs = parse_generate_kwargs(generate_kwargs) + print_rank_n("generate_kwargs:", generate_kwargs) + + output_text, num_generated_tokens = model.generate( + input_text, + generate_kwargs, + remove_input_from_output=True + ) + + print_rank_n("Output text:", output_text) + print_rank_n("Generated tokens:", num_generated_tokens) + + +if (__name__ == "__main__"): + main() diff --git a/inference/values.py b/inference/constants.py similarity index 100% rename from inference/values.py rename to inference/constants.py diff --git a/inference/ds_inference/__init__.py b/inference/ds_inference/__init__.py index 0f1181842..654495525 100644 --- a/inference/ds_inference/__init__.py +++ b/inference/ds_inference/__init__.py @@ -1 +1,3 @@ +from .cache import cache_ds_checkpoints +from .grpc_server import DSInferenceGRPCServer from .model import DSInferenceModel diff --git a/inference/ds_inference/cache.py b/inference/ds_inference/cache.py new file mode 100644 index 000000000..00dffab89 --- /dev/null +++ b/inference/ds_inference/cache.py @@ -0,0 +1,67 @@ +import argparse +import os +import shutil + +import deepspeed +import torch +from transformers import AutoConfig, AutoModelForCausalLM + +from utils import print_rank_n, run_rank_n + +from .model import write_checkponts_json + + +def cache_ds_checkpoints(args: argparse.Namespace) -> None: + print_rank_n("Loading model...") + world_size = int(os.getenv("WORLD_SIZE", "1")) + + # Load model + with deepspeed.OnDevice(dtype=args.dtype, device="meta"): + model = AutoModelForCausalLM.from_config( + AutoConfig.from_pretrained(args.model_name), + torch_dtype=torch.bfloat16 + ) + model = model.eval() + + # Write checkpoints.json + tmp_directory = "tmp" + run_rank_n( + os.makedirs, + { + "name": tmp_directory, + "exist_ok": True + } + ) + checkpoints_json = os.path.join(tmp_directory, "checkpoints.json") + run_rank_n( + write_checkponts_json, + { + "checkpoints_json": checkpoints_json, + "model_name": args.model_name + }, + barrier=True + ) + + run_rank_n( + os.makedirs, + { + "name": args.save_mp_checkpoint_path, + "exist_ok": True + }, + barrier=True + ) + + if (args.dtype == torch.float16): + model = deepspeed.init_inference( + model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True, + save_mp_checkpoint_path=args.save_mp_checkpoint_path + ) + elif (args.dtype == torch.bfloat16): + raise NotImplementedError("bfloat16 is not yet supported") + + run_rank_n(shutil.rmtree, {"path": tmp_directory}) + print_rank_n("Model loaded") diff --git a/inference/ds_inference/grpc_server.py b/inference/ds_inference/grpc_server.py new file mode 100644 index 000000000..8dacfc0ef --- /dev/null +++ b/inference/ds_inference/grpc_server.py @@ -0,0 +1,83 @@ +import argparse +import json +import os +from typing import List, Tuple, Union + +import torch +from transformers import AutoTokenizer + +import mii +from utils import Model, get_str_dtype + + +class DSInferenceGRPCServer(Model): + def __init__(self, args: argparse.Namespace) -> None: + self.deployment_name = "ds_inference_grpc_server" + + files = os.listdir(args.save_mp_checkpoint_path) + for file in files: + if (file.endswith(".json")): + checkpoints_json = json.load( + open(os.path.join(args.save_mp_checkpoint_path, file), "r")) + del checkpoints_json["base_dir"] + break + + if (args.dtype == torch.float16): + mii.deploy( + task="text-generation", + 
model=args.model_name, + deployment_name=self.deployment_name, + mii_config={ + "dtype": get_str_dtype(args.dtype), + "tensor_parallel": 8, + "port_number": 50950, + "checkpoint_dict": checkpoints_json + }, + model_path=args.save_mp_checkpoint_path + ) + else: + raise NotImplementedError("This is not yet supported") + + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.model = mii.mii_query_handle(self.deployment_name) + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[Tuple[str, int], + Tuple[List[str], List[int]]]: + return_format = type(text) + if (return_format == str): + text = [text] + + output_text = self.model.query( + { + "query": text + }, + **generate_kwargs + ).response + + output_text = [_ for _ in output_text] + + # Remove input from output + input_tokens = self.tokenizer(text).input_ids + output_tokens = self.tokenizer(output_text).input_ids + + input_token_lengths = [len(x) for x in input_tokens] + output_token_lengths = [len(x) for x in output_tokens] + generated_tokens = [ + o - i for i, o in zip(input_token_lengths, output_token_lengths)] + + if (remove_input_from_output): + output_tokens = [x[-i:] + for x, i in zip(output_tokens, generated_tokens)] + output_text = self.tokenizer.batch_decode( + output_tokens, skip_special_tokens=True) + + if (return_format == str): + return output_text[0], generated_tokens[0] + return output_text, generated_tokens + + def shutdown(self) -> None: + mii.terminate(self.deployment_name) + exit() diff --git a/inference/ds_inference/model.py b/inference/ds_inference/model.py index af5f86607..9ee6e79c2 100644 --- a/inference/ds_inference/model.py +++ b/inference/ds_inference/model.py @@ -3,7 +3,6 @@ import os import shutil from argparse import Namespace -from typing import List, Union import deepspeed import torch @@ -18,15 +17,10 @@ class DSInferenceModel(Model): def __init__(self, args: Namespace) -> None: print_rank_n("Loading model...") + world_size = int(os.getenv("WORLD_SIZE", "1")) self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) - use_hf_checkpoints = not args.save_mp_checkpoint_path - cache_ds_checkpoints = args.save_mp_checkpoint_path and not os.path.isdir( - args.save_mp_checkpoint_path) - - world_size = int(os.getenv("WORLD_SIZE", "1")) - # Load model with deepspeed.OnDevice(dtype=args.dtype, device="meta"): self.model = AutoModelForCausalLM.from_config( @@ -54,59 +48,20 @@ def __init__(self, args: Namespace) -> None: barrier=True ) - # init inference - if (use_hf_checkpoints): - print_rank_n("Loading HF checkpoints") - - if (args.dtype == torch.float16): - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True - ) - else: - raise NotImplementedError("bfloat16 is not yet supported") - elif (cache_ds_checkpoints): - print_rank_n("Caching DS checkpoints and loading model") - - run_rank_n( - os.makedirs, - { - "name": args.save_mp_checkpoint_path, - "exist_ok": True - }, - barrier=True - ) - - if (args.dtype == torch.float16): - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True, - save_mp_checkpoint_path=args.save_mp_checkpoint_path - ) - else: - raise NotImplementedError("bfloat16 is not yet supported") - else: - print_rank_n("Loading DS cached checkpoints") - + if (args.save_mp_checkpoint_path): 
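Once the __init__ above has deployed the model, any local process can talk to it over the MII handle. This is a hedged client-side sketch: it assumes the deployment named "ds_inference_grpc_server" is already running and that the mii package is importable, and the keyword arguments simply mirror the ones the generate() method above forwards.

import mii

# attach to the running deployment created by DSInferenceGRPCServer.__init__
generator = mii.mii_query_handle("ds_inference_grpc_server")

result = generator.query(
    {"query": ["DeepSpeed is a machine learning framework"]},
    max_new_tokens=40,
    do_sample=False,
)
print(result.response)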
checkpoints_json = os.path.join( args.save_mp_checkpoint_path, "BLOOM-176B_ds-inference_config.json") - if (args.dtype == torch.float16): - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True - ) - else: - raise NotImplementedError("bfloat16 is not yet supported") + if (args.dtype == torch.float16): + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) + elif (args.dtype == torch.bfloat16): + raise NotImplementedError("bfloat16 is not yet supported") run_rank_n(shutil.rmtree, {"path": tmp_directory}) @@ -115,39 +70,6 @@ def __init__(self, args: Namespace) -> None: print_rank_n("Model loaded") - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[str, List[str]]: - if (type(text) == str): - text = [text] - - input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) - - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(self.input_device) - - with torch.no_grad(): - output_tokens = self.model.generate( - **input_tokens, - **generate_kwargs - ) - - input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_token_lengths = [x.shape[0] for x in output_tokens] - generated_tokens = [ - o - i for i, o in zip(input_token_lengths, output_token_lengths)] - - if (remove_input_from_output): - output_tokens = [x[-i:] - for x, i in zip(output_tokens, generated_tokens)] - - output_text = self.tokenizer.batch_decode( - output_tokens, skip_special_tokens=True) - - return output_text, generated_tokens - def get_checkpoint_files(pretrained_model_name_or_path): # XXX: I just hacked this one together to automatically handle the fetching of the model file or diff --git a/inference/ds_zero/model.py b/inference/ds_zero/model.py index 7b9a2b365..0db700869 100644 --- a/inference/ds_zero/model.py +++ b/inference/ds_zero/model.py @@ -1,13 +1,12 @@ import os from argparse import Namespace -from typing import List, Union import deepspeed import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.deepspeed import HfDeepSpeedConfig -from utils import Model, print_rank_n +from utils import Model class DSZeROModel(Model): @@ -61,81 +60,3 @@ def __init__(self, args: Namespace) -> None: self.model = self.model.module self.input_device = torch.cuda.current_device() - - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[str, List[str]]: - if (type(text) == str): - text = [text] - - input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) - - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(self.input_device) - - with torch.no_grad(): - output_tokens = self.model.generate( - **input_tokens, - **generate_kwargs - ) - - input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_token_lengths = [x.shape[0] for x in output_tokens] - generated_tokens = [ - o - i for i, o in zip(input_token_lengths, output_token_lengths)] - - if (remove_input_from_output): - output_tokens = [x[-i:] - for x, i in zip(output_tokens, generated_tokens)] - - output_text = self.tokenizer.batch_decode( - output_tokens, skip_special_tokens=True) - - return output_text, generated_tokens - - 
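For reference, the kernel-injected load path that model.py keeps can be written as a stand-alone sketch: materialise the model on the meta device, then hand init_inference a checkpoints.json such as the one produced by write_checkponts_json. The model name and json path below are placeholders, fp16 is assumed because bf16 is rejected above, and this block only restates the pattern, it is not an addition to the patch.

import os

import deepspeed
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_name = "bigscience/bloom"            # placeholder
checkpoints_json = "tmp/checkpoints.json"  # placeholder path to a shard listing

# build the graph on the meta device so no weights are loaded on the host
with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
    model = AutoModelForCausalLM.from_config(
        AutoConfig.from_pretrained(model_name), torch_dtype=torch.float16
    ).eval()

# let DS-inference place the fp16 shards listed in checkpoints.json onto the GPUs
model = deepspeed.init_inference(
    model,
    mp_size=int(os.getenv("WORLD_SIZE", "1")),
    dtype=torch.float16,
    checkpoint=checkpoints_json,
    replace_with_kernel_inject=True,
)
model = model.module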
-def get_max_memory_per_gpu_dict(dtype, model_name): - """ try to generate the memory map based on what we know about the model and the available hardware """ - - # figure out the memory map - the minimum per gpu required to load the model - n_gpus = torch.cuda.device_count() - - if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: - # hand crafted optimized memory map for 8x80 setup over BLOOM - # this works with bs=40 - return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} - - try: - # model_params calculation, as we don't have a model yet to do: - #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - - config = AutoConfig.from_pretrained(model_name) - h = config.n_embed - l = config.n_layer - v = config.vocab_size - # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing - model_params = l*(12*h**2 + 13*h) + v*h + 4*h - except: - print_rank_n( - f"The model {model_name} has a broken config file. Please notify the owner") - raise - - bytes = torch.finfo(dtype).bits / 8 - param_memory_total_in_bytes = model_params * bytes - # add 5% since weight sizes aren't the same and some GPU may need more memory - param_memory_per_gpu_in_bytes = int( - param_memory_total_in_bytes / n_gpus * 1.05) - print_rank_n( - f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") - - # check the real available memory - # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) - torch.ones(1).cuda() - max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] - if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: - raise ValueError( - f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") - - return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} diff --git a/inference/hf_accelerate/model.py b/inference/hf_accelerate/model.py index 5d0a8c7f2..2ce8ae020 100644 --- a/inference/hf_accelerate/model.py +++ b/inference/hf_accelerate/model.py @@ -1,5 +1,4 @@ from argparse import Namespace -from typing import List, Union import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer @@ -26,39 +25,6 @@ def __init__(self, args: Namespace) -> None: print_rank_n("Model loaded") - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[str, List[str]]: - if (type(text) == str): - text = [text] - - input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) - - for t in input_tokens: - if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(self.input_device) - - with torch.no_grad(): - output_tokens = self.model.generate( - **input_tokens, - **generate_kwargs - ) - - input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] - output_token_lengths = [x.shape[0] for x in output_tokens] - generated_tokens = [ - o - i for i, o in zip(input_token_lengths, output_token_lengths)] - - if (remove_input_from_output): - output_tokens = [x[-i:] - for x, i in zip(output_tokens, generated_tokens)] - - output_text = self.tokenizer.batch_decode( - output_tokens, skip_special_tokens=True) - - return output_text, generated_tokens 
- def get_max_memory_per_gpu_dict(dtype, model_name): """ try to generate the memory map based on what we know about the model and the available hardware """ diff --git a/inference/utils/__init__.py b/inference/utils/__init__.py index 58f0a85d2..f66c16623 100644 --- a/inference/utils/__init__.py +++ b/inference/utils/__init__.py @@ -1 +1,12 @@ -from .utils import MaxTokensError, Model, benchmark_end_to_end, get_args, get_argument_parser, print_rank_n, run_rank_n +from .model import Model +from .utils import ( + Execute, + MaxTokensError, + get_args, + get_argument_parser, + get_dummy_batch, + get_str_dtype, + parse_generate_kwargs, + print_rank_n, + run_rank_n +) diff --git a/inference/utils/model.py b/inference/utils/model.py new file mode 100644 index 000000000..f21239524 --- /dev/null +++ b/inference/utils/model.py @@ -0,0 +1,54 @@ +import argparse +from typing import List, Tuple, Union + +import torch + + +class Model: + def __init__(self, args: argparse.Namespace) -> None: + self.tokenizer = None + self.model = None + self.input_device = None + raise NotImplementedError("This is a dummy class") + + def generate(self, + text: Union[str, List[str]], + generate_kwargs: dict, + remove_input_from_output: bool = False) -> Union[Tuple[str, int], + Tuple[List[str], List[int]]]: + return_type = type(text) + if (return_type == str): + text = [text] + + input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) + + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(self.input_device) + + with torch.no_grad(): + output_tokens = self.model.generate( + **input_tokens, + **generate_kwargs + ) + + input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_token_lengths = [x.shape[0] for x in output_tokens] + generated_tokens = [ + o - i for i, o in zip(input_token_lengths, output_token_lengths)] + + if (remove_input_from_output): + output_tokens = [x[-i:] + for x, i in zip(output_tokens, generated_tokens)] + + output_text = self.tokenizer.batch_decode( + output_tokens, skip_special_tokens=True) + + if (return_type == str): + output_text = output_text[0] + generated_tokens = generated_tokens[0] + + return output_text, generated_tokens + + def shutdown(self) -> None: + exit() diff --git a/inference/utils/utils.py b/inference/utils/utils.py index 0ddc4497d..d78362a9f 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -1,10 +1,7 @@ import argparse import copy -import gc import math -import os -import time -from typing import Any, List, Union +from typing import Any, List import torch import torch.distributed as dist @@ -37,18 +34,7 @@ def __call__(self) -> Any: return self.func(**self.kwargs) -class Model: - def __init__(self, args: argparse.Namespace) -> None: - raise NotImplementedError("This is a dummy class") - - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[str, List[str]]: - raise NotImplementedError("This is a dummy class") - - -def get_argument_parser(): +def get_argument_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() group = parser.add_argument_group(title="model") @@ -57,13 +43,20 @@ def get_argument_parser(): group.add_argument("--dtype", type=str, required=True, choices=["bf16", "fp16"], help="dtype for model") group.add_argument("--batch_size", default=1, type=int, help="batch size") - group.add_argument("--generate_kwargs", type=dict, default={}, - help="generate parameters. 
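The input-trimming logic in Model.generate is worth seeing on plain lists: the number of new tokens is just the length difference, and x[-i:] keeps that tail. One caveat is that x[-0:] returns the whole sequence, so the slice only behaves as intended when at least one token was generated.

input_ids = [5, 6, 7]            # pretend prompt tokens
output_ids = [5, 6, 7, 42, 43]   # pretend model output (prompt + new tokens)

new_tokens = len(output_ids) - len(input_ids)
print(new_tokens)                 # 2
print(output_ids[-new_tokens:])   # [42, 43], i.e. the output with the prompt removed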
look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters") + group.add_argument( + "--generate_kwargs", + type=dict, + default={ + "max_new_tokens": 100, + "do_sample": False + }, + help="generate parameters. look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters" + ) return parser -def get_args(parser: argparse.ArgumentParser): +def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() args.dtype = get_torch_dtype(args.dtype) return args @@ -72,7 +65,8 @@ def get_args(parser: argparse.ArgumentParser): def run_rank_n(func: callable, kwargs: dict, barrier: bool = False, - rank: int = 0) -> Any: + rank: int = 0, + other_rank_output: Any = None) -> Any: if (dist.is_initialized()): if (dist.get_rank() == rank): output = func(**kwargs) @@ -82,6 +76,7 @@ def run_rank_n(func: callable, else: if (barrier): dist.barrier() + return other_rank_output else: return func(**kwargs) @@ -101,6 +96,13 @@ def get_torch_dtype(dtype_str: str) -> torch.dtype: return torch.float16 +def get_str_dtype(dtype_str: str) -> torch.dtype: + if (dtype_str == torch.bfloat16): + return "bf16" + elif (dtype_str == torch.float16): + return "fp16" + + def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: if (input_sentences == None): input_sentences = copy.deepcopy(dummy_input_sentences) @@ -112,130 +114,93 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[ return input_sentences -def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: - """ - runs a list of Execute objects and returns a list of outputs and the time taken - """ - start_time = time.time() - - if (type(execs) == list): - results = [] - for e in execs: - results.append(e()) - else: - results = execs() - - time_elapsed = time.time() - start_time - return results, time_elapsed - - -def benchmark_generation(input_sentences, - model, - generate_kwargs, - cycles: int = 5): - total_new_tokens_generated = 0 - for _ in range(cycles): - _, num_generated_tokens = model.generate( - input_sentences, - generate_kwargs - ) - total_new_tokens_generated += sum( - new_tokens for new_tokens in num_generated_tokens) - return total_new_tokens_generated - - -def get_benchmark_results(benchmark_time: float, - initialization_time: float, - generation_time: float, - total_new_tokens_generated: int, - batch_size: int) -> str: - throughput = total_new_tokens_generated / benchmark_time - return f""" -*** Performance stats: -Throughput (including tokenization) = {throughput:.2f} tokens/sec -Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token -Model loading time = {initialization_time:.2f} secs -Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Generation time per batch = {generation_time:.2f} secs -Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs -""" - - -def benchmark_end_to_end(args: argparse.Namespace, - model_class: Model, - zero_activated: bool = False) -> None: - model, initialization_time = run_and_log_time( - Execute(model_class, {"args": args}) - ) - - if (args.generate_kwargs): - generate_kwargs = args.generate_kwargs - else: - generate_kwargs = { - "max_new_tokens": 100, - "do_sample": False - } - - 
print_rank_n( - f"*** Starting to generate {generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") - - input_sentences = get_dummy_batch(args.batch_size) - - print_rank_n(f"Generate args {generate_kwargs}") - - # warmup is a must if measuring speed as it's when all the optimizations are performed - # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - model.generate( - input_sentences, - generate_kwargs - ) - - (output_text, num_generated_tokens), generation_time = run_and_log_time( - Execute( - model.generate, - { - "text": input_sentences, - "generate_kwargs": generate_kwargs - } - ) - ) - for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): - print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") - - if (args.benchmark_cycles > 0): - print_rank_n(f"*** Running benchmark") - - torch.cuda.empty_cache() - gc.collect() - - # warm up - model.generate(input_sentences, generate_kwargs) - torch.cuda.synchronize() - - # benchmark - total_new_tokens_generated, benchmark_time = run_and_log_time( - Execute( - benchmark_generation, - { - "input_sentences": input_sentences, - "model": model, - "generate_kwargs": generate_kwargs, - "cycles": args.benchmark_cycles - } - ) - ) - - # with ZeRO every GPU is generating batch_size * sequence_length tokens - if (zero_activated): - world_size = int(os.getenv('WORLD_SIZE', '1')) - total_new_tokens_generated *= world_size - - print_rank_n( - get_benchmark_results( - benchmark_time, - initialization_time, - generation_time, - total_new_tokens_generated, - args.batch_size - ) - ) +def parse_generate_kwargs(kwargs: dict) -> dict: + if ("max_length" in kwargs): + kwargs["max_length"] = int(kwargs["max_length"]) + if ("min_length" in kwargs): + kwargs["min_length"] = int(kwargs["min_length"]) + if ("do_sample" in kwargs): + kwargs["do_sample"] = bool(kwargs["do_sample"]) + if ("early_stopping" in kwargs): + kwargs["early_stopping"] = bool(kwargs["early_stopping"]) + if ("num_beams" in kwargs): + kwargs["num_beams"] = int(kwargs["num_beams"]) + if ("temperature" in kwargs): + kwargs["temperature"] = float(kwargs["temperature"]) + if ("top_k" in kwargs): + kwargs["top_k"] = int(kwargs["top_k"]) + if ("top_p" in kwargs): + kwargs["top_p"] = float(kwargs["top_p"]) + if ("typical_p" in kwargs): + kwargs["typical_p"] = float(kwargs["typical_p"]) + if ("repitition_penalty" in kwargs): + kwargs["repitition_penalty"] = float(kwargs["repitition_penalty"]) + if ("bos_token_id" in kwargs): + kwargs["bos_token_id"] = int(kwargs["bos_token_id"]) + if ("pad_token_id" in kwargs): + kwargs["pad_token_id"] = int(kwargs["pad_token_id"]) + if ("eos_token_id" in kwargs): + kwargs["eos_token_id"] = int(kwargs["eos_token_id"]) + if ("length_penalty" in kwargs): + kwargs["length_penalty"] = float(kwargs["length_penalty"]) + if ("no_repeat_ngram_size" in kwargs): + kwargs["no_repeat_ngram_size"] = int(kwargs["no_repeat_ngram_size"]) + if ("encoder_no_repeat_ngram_size" in kwargs): + kwargs["encoder_no_repeat_ngram_size"] = int( + kwargs["encoder_no_repeat_ngram_size"]) + if ("num_return_sequences" in kwargs): + kwargs["num_return_sequences"] = int(kwargs["num_return_sequences"]) + if ("max_time" in kwargs): + kwargs["max_time"] = float(kwargs["max_time"]) + if ("max_new_tokens" in kwargs): + kwargs["max_new_tokens"] = int(kwargs["max_new_tokens"]) + if ("decoder_start_token_id" in kwargs): + kwargs["decoder_start_token_id"] = int( + kwargs["decoder_start_token_id"]) + if ("num_beam_groups" in kwargs): + 
kwargs["num_beam_groups"] = int(kwargs["num_beam_groups"]) + if ("diversity_penalty" in kwargs): + kwargs["diversity_penalty"] = float(kwargs["diversity_penalty"]) + if ("forced_bos_token_id" in kwargs): + kwargs["forced_bos_token_id"] = int(kwargs["forced_bos_token_id"]) + if ("forced_eos_token_id" in kwargs): + kwargs["forced_eos_token_id"] = int(kwargs["forced_eos_token_id"]) + if ("exponential_decay_length_penalty" in kwargs): + kwargs["exponential_decay_length_penalty"] = float( + kwargs["exponential_decay_length_penalty"]) + + # i was being lazy :) + if ("bad_words_ids" in kwargs): + del kwargs["bad_words_ids"] + if ("force_words_ids" in kwargs): + del kwargs["force_words_ids"] + + # so people don't slow down the server + if ("use_cache" in kwargs): + del kwargs["use_cache"] + if ("remove_invalid_values" in kwargs): + del kwargs["remove_invalid_values"] + if ("synced_gpus" in kwargs): + del kwargs["synced_gpus"] + + # no idea how to support this in a server setting + if ("prefix_allowed_tokens_fn" in kwargs): + del kwargs["prefix_allowed_tokens_fn"] + if ("logits_processor" in kwargs): + del kwargs["logits_processor"] + if ("renormalize_logits" in kwargs): + del kwargs["renormalize_logits"] + if ("stopping_criteria" in kwargs): + del kwargs["stopping_criteria"] + if ("constraints" in kwargs): + del kwargs["constraints"] + if ("output_attentions" in kwargs): + del kwargs["output_attentions"] + if ("output_hidden_states" in kwargs): + del kwargs["output_hidden_states"] + if ("output_scores" in kwargs): + del kwargs["output_scores"] + if ("return_dict_in_generate" in kwargs): + del kwargs["return_dict_in_generate"] + + return kwargs From 29059555df46994a4ea90d5362dcfadbdf6c1a0b Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Mon, 15 Aug 2022 08:48:25 +0530 Subject: [PATCH 08/32] server --- inference/benchmark.py | 1 + inference/cli.py | 5 +- inference/server.py | 118 +++++++++++++++++++++++++++++++++++++++ inference/utils/utils.py | 24 ++++++-- 4 files changed, 140 insertions(+), 8 deletions(-) create mode 100644 inference/server.py diff --git a/inference/benchmark.py b/inference/benchmark.py index 44478ef71..69aacae7c 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -154,6 +154,7 @@ def get_args() -> argparse.Namespace: default=0, help="additionally run benchmark") group.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") + group.add_argument("--batch_size", default=1, type=int, help="batch size") group.add_argument("--save_mp_checkpoint_path", required=False, type=str, help="MP checkpoints path for DS inference") group.add_argument("--cpu_offload", action="store_true", diff --git a/inference/cli.py b/inference/cli.py index 1df691841..4ff051fa3 100644 --- a/inference/cli.py +++ b/inference/cli.py @@ -62,13 +62,14 @@ def main() -> None: if (input("change generate_kwargs? 
[y/n] ") == "y"): generate_kwargs = input("Generate kwargs: ") generate_kwargs = json.loads(generate_kwargs) - generate_kwargs = parse_generate_kwargs(generate_kwargs) + generate_kwargs, remove_input_from_output = parse_generate_kwargs( + generate_kwargs) print_rank_n("generate_kwargs:", generate_kwargs) output_text, num_generated_tokens = model.generate( input_text, generate_kwargs, - remove_input_from_output=True + remove_input_from_output=remove_input_from_output ) print_rank_n("Output text:", output_text) diff --git a/inference/server.py b/inference/server.py new file mode 100644 index 000000000..e6d2ed2b6 --- /dev/null +++ b/inference/server.py @@ -0,0 +1,118 @@ +import argparse +import logging +import time +from typing import List, Union + +import constants +import utils +from ds_inference import DSInferenceGRPCServer +from fastapi import FastAPI, HTTPException +from hf_accelerate import HFAccelerateModel +from pydantic import BaseModel +from utils import MaxTokensError, get_argument_parser, parse_generate_kwargs +from uvicorn import run + + +class GenerateRequest(BaseModel): + text: Union[List[str], str] + generate_kwargs: dict + + +class GenerateResponse(BaseModel): + text: Union[List[str], str] + num_generated_tokens: Union[List[int], int] + query_id: int + total_time_taken: float + + +def get_args() -> argparse.Namespace: + parser = get_argument_parser() + + group = parser.add_argument_group(title="launch config") + group.add_argument( + "--deployment_framework", + type=str, + choices=[ + constants.HF_ACCELERATE, + constants.DS_INFERENCE, + ], + default=constants.HF_ACCELERATE + ) + group.add_argument("--save_mp_checkpoint_path", required=False, + type=str, help="MP checkpoints path for DS inference") + group.add_argument("--log_file", type=str, help="log data") + group.add_argument("--host", type=str, required=True, help="host address") + group.add_argument("--port", type=int, required=True, help="port number") + group.add_argument("--workers", type=int, default=1, + help="number of http workers") + group.add_argument("--allowed_max_new_tokens", type=int, + default=100, help="max allowed tokens") + + args = utils.get_args(parser) + + return args + + +#################################################################################### +args = get_args() +app = FastAPI() + +# Setup logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d : %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + filename=args.log_file +) +logger = logging.getLogger(__name__) + +if (args.deployment_framework == constants.HF_ACCELERATE): + model = HFAccelerateModel(args) +elif (args.deployment_framework == constants.DS_INFERENCE): + model = DSInferenceGRPCServer(args) + +query_id = 0 +#################################################################################### + + +@app.post("/generate/") +def generate(request: GenerateRequest) -> dict: + # needs to be global since it is updated + global query_id + + try: + start_time = time.time() + + text = request.text + generate_kwargs = args.generate_kwargs + remove_input_from_output = False + if (request.generate_kwargs): + generate_kwargs, remove_input_from_output = parse_generate_kwargs( + request.generate_kwargs) + + if (generate_kwargs["max_new_tokens"] > args.allowed_max_new_tokens): + raise MaxTokensError( + generate_kwargs["max_new_tokens"], args.allowed_max_new_tokens) + + output_text, num_generated_tokens = model.generate( + text, + generate_kwargs, + 
remove_input_from_output=remove_input_from_output + ) + + total_time_taken = time.time() - start_time + + output = GenerateResponse( + text=output_text, + num_generated_tokens=num_generated_tokens, + query_id=query_id, + total_time_taken=total_time_taken + ) + query_id += 1 + return output + except Exception as e: + query_id += 1 + raise HTTPException(500, {"error": str(e)}) + + +run(app, host=args.host, port=args.port, workers=args.workers) diff --git a/inference/utils/utils.py b/inference/utils/utils.py index d78362a9f..d294cd5d7 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -42,7 +42,6 @@ def get_argument_parser() -> argparse.ArgumentParser: required=True, help="model to use") group.add_argument("--dtype", type=str, required=True, choices=["bf16", "fp16"], help="dtype for model") - group.add_argument("--batch_size", default=1, type=int, help="batch size") group.add_argument( "--generate_kwargs", type=dict, @@ -114,15 +113,22 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[ return input_sentences +def parse_bool(value: str) -> bool: + if (value.lower() == "true"): + return True + elif (value.lower() == "false"): + return False + else: + raise ValueError("{} is not a valid boolean value".format(value)) + + def parse_generate_kwargs(kwargs: dict) -> dict: - if ("max_length" in kwargs): - kwargs["max_length"] = int(kwargs["max_length"]) if ("min_length" in kwargs): kwargs["min_length"] = int(kwargs["min_length"]) if ("do_sample" in kwargs): - kwargs["do_sample"] = bool(kwargs["do_sample"]) + kwargs["do_sample"] = parse_bool(kwargs["do_sample"]) if ("early_stopping" in kwargs): - kwargs["early_stopping"] = bool(kwargs["early_stopping"]) + kwargs["early_stopping"] = parse_bool(kwargs["early_stopping"]) if ("num_beams" in kwargs): kwargs["num_beams"] = int(kwargs["num_beams"]) if ("temperature" in kwargs): @@ -182,6 +188,8 @@ def parse_generate_kwargs(kwargs: dict) -> dict: del kwargs["remove_invalid_values"] if ("synced_gpus" in kwargs): del kwargs["synced_gpus"] + if ("max_length" in kwargs): + del kwargs["max_length"] # no idea how to support this in a server setting if ("prefix_allowed_tokens_fn" in kwargs): @@ -203,4 +211,8 @@ def parse_generate_kwargs(kwargs: dict) -> dict: if ("return_dict_in_generate" in kwargs): del kwargs["return_dict_in_generate"] - return kwargs + remove_input_from_output = False + if ("remove_input_from_output" in kwargs): + remove_input_from_output = parse_bool(kwargs["remove_input_from_output"]) + + return kwargs, remove_input_from_output From 12c4cf7b5b0378029542ca730377ff7e2f13fb5d Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Tue, 16 Aug 2022 19:01:28 +0530 Subject: [PATCH 09/32] request --- inference/benchmark.py | 38 +++------ inference/cli.py | 20 ++--- inference/ds_inference/grpc_server.py | 67 +++++++++++----- inference/server.py | 51 +++--------- inference/utils/__init__.py | 1 + inference/utils/model.py | 45 ++++++++--- inference/utils/requests.py | 98 +++++++++++++++++++++++ inference/utils/utils.py | 107 +------------------------- 8 files changed, 215 insertions(+), 212 deletions(-) create mode 100644 inference/utils/requests.py diff --git a/inference/benchmark.py b/inference/benchmark.py index 69aacae7c..31af2a895 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -12,7 +12,7 @@ from ds_inference import DSInferenceModel from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel -from utils import Execute, Model, get_argument_parser, get_dummy_batch, 
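The /generate/ endpoint added in server.py can be exercised with any HTTP client. The sketch below assumes the server was launched with --host 127.0.0.1 --port 5000 (both are flags, not fixed values) and that the requests package is installed; values inside generate_kwargs are sent as strings because parse_generate_kwargs casts them on the server side.

import requests

payload = {
    "text": ["DeepSpeed is a machine learning framework"],
    "generate_kwargs": {"max_new_tokens": "40", "do_sample": "false"},
}

# POST to the route registered with @app.post("/generate/")
reply = requests.post("http://127.0.0.1:5000/generate/", json=payload).json()
print(reply["text"])
print(reply["num_generated_tokens"], reply["total_time_taken"])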
print_rank_n +from utils import Execute, Model, get_argument_parser, get_dummy_batch, print_rank_n, GenerateRequest def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: @@ -32,18 +32,14 @@ def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], f return results, time_elapsed -def benchmark_generation(input_sentences, - model, - generate_kwargs, +def benchmark_generation(model: Model, + request: GenerateRequest, cycles: int = 5): total_new_tokens_generated = 0 for _ in range(cycles): - _, num_generated_tokens = model.generate( - input_sentences, - generate_kwargs - ) + response = model.generate(request) total_new_tokens_generated += sum( - new_tokens for new_tokens in num_generated_tokens) + new_tokens for new_tokens in response.num_generated_tokens) return total_new_tokens_generated @@ -80,21 +76,12 @@ def benchmark_end_to_end(args: argparse.Namespace, # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - model.generate( - input_sentences, - args.generate_kwargs - ) + request = GenerateRequest(input_sentences, args.generate_kwargs) - (output_text, num_generated_tokens), generation_time = run_and_log_time( - Execute( - model.generate, - { - "text": input_sentences, - "generate_kwargs": args.generate_kwargs - } - ) - ) - for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)): + response, generation_time = run_and_log_time( + Execute(model.generate, {"request": request})) + + for i, (o, _) in zip(request.text, zip(response.text, response.num_generated_tokens)): print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") if (args.benchmark_cycles > 0): @@ -104,7 +91,7 @@ def benchmark_end_to_end(args: argparse.Namespace, gc.collect() # warm up - model.generate(input_sentences, args.generate_kwargs) + model.generate(request) torch.cuda.synchronize() # benchmark @@ -112,9 +99,8 @@ def benchmark_end_to_end(args: argparse.Namespace, Execute( benchmark_generation, { - "input_sentences": input_sentences, "model": model, - "generate_kwargs": args.generate_kwargs, + "request": request, "cycles": args.benchmark_cycles } ) diff --git a/inference/cli.py b/inference/cli.py index 4ff051fa3..e54a0b192 100644 --- a/inference/cli.py +++ b/inference/cli.py @@ -48,6 +48,7 @@ def main() -> None: f"Unknown deployment framework {args.deployment_framework}") generate_kwargs = args.generate_kwargs + request = parse_generate_kwargs(generate_kwargs) while (True): # currently only 1 process is running so its @@ -62,18 +63,13 @@ def main() -> None: if (input("change generate_kwargs? 
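The `Execute`/`run_and_log_time` pair is the timing backbone of this benchmark. The snippet below is a self-contained sketch of the same pattern, not the repo's implementation; `_wait_and_add` merely stands in for model loading or generation.

```python
import time
from typing import Any, Callable, List, Tuple


class _Execute:
    # bundles a callable with its kwargs so it can be invoked (and timed) later
    def __init__(self, func: Callable, kwargs: dict) -> None:
        self.func = func
        self.kwargs = kwargs

    def __call__(self) -> Any:
        return self.func(**self.kwargs)


def _run_and_log_time(execs: List[_Execute]) -> Tuple[List[Any], float]:
    # runs every deferred call and returns the results plus total elapsed time
    start = time.time()
    results = [e() for e in execs]
    return results, time.time() - start


def _wait_and_add(a: float, b: float) -> float:
    time.sleep(0.1)  # stand-in for expensive work such as generate()
    return a + b


results, elapsed = _run_and_log_time([_Execute(_wait_and_add, {"a": 1, "b": 2})])
print(results, f"{elapsed:.2f} secs")
```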
[y/n] ") == "y"): generate_kwargs = input("Generate kwargs: ") generate_kwargs = json.loads(generate_kwargs) - generate_kwargs, remove_input_from_output = parse_generate_kwargs( - generate_kwargs) - print_rank_n("generate_kwargs:", generate_kwargs) - - output_text, num_generated_tokens = model.generate( - input_text, - generate_kwargs, - remove_input_from_output=remove_input_from_output - ) - - print_rank_n("Output text:", output_text) - print_rank_n("Generated tokens:", num_generated_tokens) + request = parse_generate_kwargs(generate_kwargs) + + request.text = input_text + response = model.generate(request) + + print_rank_n("Output text:", response.text) + print_rank_n("Generated tokens:", response.num_generated_tokens) if (__name__ == "__main__"): diff --git a/inference/ds_inference/grpc_server.py b/inference/ds_inference/grpc_server.py index 8dacfc0ef..e1eece44a 100644 --- a/inference/ds_inference/grpc_server.py +++ b/inference/ds_inference/grpc_server.py @@ -1,13 +1,12 @@ import argparse import json import os -from typing import List, Tuple, Union import torch from transformers import AutoTokenizer import mii -from utils import Model, get_str_dtype +from utils import GenerateRequest, GenerateResponse, Model, get_str_dtype class DSInferenceGRPCServer(Model): @@ -19,9 +18,11 @@ def __init__(self, args: argparse.Namespace) -> None: if (file.endswith(".json")): checkpoints_json = json.load( open(os.path.join(args.save_mp_checkpoint_path, file), "r")) - del checkpoints_json["base_dir"] break + if ("base_dir" in checkpoints_json): + del checkpoints_json["base_dir"] + if (args.dtype == torch.float16): mii.deploy( task="text-generation", @@ -41,20 +42,41 @@ def __init__(self, args: argparse.Namespace) -> None: self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) self.model = mii.mii_query_handle(self.deployment_name) - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[Tuple[str, int], - Tuple[List[str], List[int]]]: - return_format = type(text) - if (return_format == str): + def generate(self, request: GenerateRequest) -> GenerateResponse: + text = request.text + + return_type = type(text) + if (return_type == str): text = [text] output_text = self.model.query( - { - "query": text - }, - **generate_kwargs + {"query": text}, + min_length=request.min_length, + do_sample=request.do_sample, + early_stopping=request.early_stopping, + num_beams=request.num_beams, + temperature=request.temperature, + top_k=request.top_k, + top_p=request.top_p, + typical_p=request.typical_p, + repitition_penalty=request.repitition_penalty, + bos_token_id=request.bos_token_id, + pad_token_id=request.pad_token_id, + eos_token_id=request.eos_token_id, + length_penalty=request.length_penalty, + no_repeat_ngram_size=request.no_repeat_ngram_size, + encoder_no_repeat_ngram_size=request.encoder_no_repeat_ngram_size, + num_return_sequences=request.num_return_sequences, + max_time=request.max_time, + max_new_tokens=request.max_new_tokens, + decoder_start_token_id=request.decoder_start_token_id, + num_beam_groups=request.num_beam_groups, + diversity_penalty=request.diversity_penalty, + forced_bos_token_id=request.forced_bos_token_id, + forced_eos_token_id=request.forced_eos_token_id, + exponential_decay_length_penalty=request.exponential_decay_length_penalty, + bad_words_ids=request.bad_words_ids, + force_words_ids=request.force_words_ids ).response output_text = [_ for _ in output_text] @@ -65,18 +87,23 @@ def generate(self, 
input_token_lengths = [len(x) for x in input_tokens] output_token_lengths = [len(x) for x in output_tokens] - generated_tokens = [ + num_generated_tokens = [ o - i for i, o in zip(input_token_lengths, output_token_lengths)] - if (remove_input_from_output): + if (request.remove_input_from_output): output_tokens = [x[-i:] - for x, i in zip(output_tokens, generated_tokens)] + for x, i in zip(output_tokens, num_generated_tokens)] output_text = self.tokenizer.batch_decode( output_tokens, skip_special_tokens=True) - if (return_format == str): - return output_text[0], generated_tokens[0] - return output_text, generated_tokens + if (return_type == str): + output_text = output_text[0] + num_generated_tokens = num_generated_tokens[0] + + return GenerateResponse( + text=output_text, + num_generated_tokens=num_generated_tokens + ) def shutdown(self) -> None: mii.terminate(self.deployment_name) diff --git a/inference/server.py b/inference/server.py index e6d2ed2b6..8876485a4 100644 --- a/inference/server.py +++ b/inference/server.py @@ -1,30 +1,16 @@ import argparse import logging import time -from typing import List, Union import constants import utils from ds_inference import DSInferenceGRPCServer from fastapi import FastAPI, HTTPException from hf_accelerate import HFAccelerateModel -from pydantic import BaseModel -from utils import MaxTokensError, get_argument_parser, parse_generate_kwargs +from utils import GenerateRequest, MaxTokensError, get_argument_parser from uvicorn import run -class GenerateRequest(BaseModel): - text: Union[List[str], str] - generate_kwargs: dict - - -class GenerateResponse(BaseModel): - text: Union[List[str], str] - num_generated_tokens: Union[List[int], int] - query_id: int - total_time_taken: float - - def get_args() -> argparse.Namespace: parser = get_argument_parser() @@ -83,33 +69,18 @@ def generate(request: GenerateRequest) -> dict: try: start_time = time.time() - text = request.text - generate_kwargs = args.generate_kwargs - remove_input_from_output = False - if (request.generate_kwargs): - generate_kwargs, remove_input_from_output = parse_generate_kwargs( - request.generate_kwargs) - - if (generate_kwargs["max_new_tokens"] > args.allowed_max_new_tokens): + if (request.max_new_tokens > args.allowed_max_new_tokens): raise MaxTokensError( - generate_kwargs["max_new_tokens"], args.allowed_max_new_tokens) - - output_text, num_generated_tokens = model.generate( - text, - generate_kwargs, - remove_input_from_output=remove_input_from_output - ) - - total_time_taken = time.time() - start_time - - output = GenerateResponse( - text=output_text, - num_generated_tokens=num_generated_tokens, - query_id=query_id, - total_time_taken=total_time_taken - ) + request.max_new_tokens, + args.allowed_max_new_tokens + ) + + response = model.generate(request) + response.query_id = query_id + response.total_time_taken = time.time() - start_time + query_id += 1 - return output + return response except Exception as e: query_id += 1 raise HTTPException(500, {"error": str(e)}) diff --git a/inference/utils/__init__.py b/inference/utils/__init__.py index f66c16623..063d413cb 100644 --- a/inference/utils/__init__.py +++ b/inference/utils/__init__.py @@ -1,4 +1,5 @@ from .model import Model +from .requests import GenerateRequest, GenerateResponse from .utils import ( Execute, MaxTokensError, diff --git a/inference/utils/model.py b/inference/utils/model.py index f21239524..33da8d114 100644 --- a/inference/utils/model.py +++ b/inference/utils/model.py @@ -1,8 +1,9 @@ import argparse -from typing 
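Both the gRPC client above and the base `Model` class use the same token accounting: generated-token counts are output length minus input length, and `remove_input_from_output` keeps only the newly generated tail. A toy, tokenizer-free illustration (the token IDs are made up):

```python
# made-up token IDs; in the real code these come from the tokenizer and model
input_tokens = [[101, 7592], [101, 2129, 2024, 2017]]                   # lengths 2 and 4
output_tokens = [[101, 7592, 5, 6, 7], [101, 2129, 2024, 2017, 8, 9]]   # prompt + new tokens

input_lengths = [len(x) for x in input_tokens]
output_lengths = [len(x) for x in output_tokens]
num_generated_tokens = [o - i for i, o in zip(input_lengths, output_lengths)]
print(num_generated_tokens)  # [3, 2]

# keep only the newly generated tail of each sequence
output_tokens = [x[-n:] for x, n in zip(output_tokens, num_generated_tokens)]
print(output_tokens)  # [[5, 6, 7], [8, 9]]

# caveat: x[-0:] returns the whole sequence, so a request that generates
# zero new tokens would not actually be trimmed by this slicing
```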
import List, Tuple, Union import torch +from .requests import GenerateRequest, GenerateResponse + class Model: def __init__(self, args: argparse.Namespace) -> None: @@ -11,11 +12,9 @@ def __init__(self, args: argparse.Namespace) -> None: self.input_device = None raise NotImplementedError("This is a dummy class") - def generate(self, - text: Union[str, List[str]], - generate_kwargs: dict, - remove_input_from_output: bool = False) -> Union[Tuple[str, int], - Tuple[List[str], List[int]]]: + def generate(self, request: GenerateRequest) -> GenerateResponse: + text = request.text + return_type = type(text) if (return_type == str): text = [text] @@ -29,7 +28,32 @@ def generate(self, with torch.no_grad(): output_tokens = self.model.generate( **input_tokens, - **generate_kwargs + min_length=request.min_length, + do_sample=request.do_sample, + early_stopping=request.early_stopping, + num_beams=request.num_beams, + temperature=request.temperature, + top_k=request.top_k, + top_p=request.top_p, + typical_p=request.typical_p, + repitition_penalty=request.repitition_penalty, + bos_token_id=request.bos_token_id, + pad_token_id=request.pad_token_id, + eos_token_id=request.eos_token_id, + length_penalty=request.length_penalty, + no_repeat_ngram_size=request.no_repeat_ngram_size, + encoder_no_repeat_ngram_size=request.encoder_no_repeat_ngram_size, + num_return_sequences=request.num_return_sequences, + max_time=request.max_time, + max_new_tokens=request.max_new_tokens, + decoder_start_token_id=request.decoder_start_token_id, + num_beam_groups=request.num_beam_groups, + diversity_penalty=request.diversity_penalty, + forced_bos_token_id=request.forced_bos_token_id, + forced_eos_token_id=request.forced_eos_token_id, + exponential_decay_length_penalty=request.exponential_decay_length_penalty, + bad_words_ids=request.bad_words_ids, + force_words_ids=request.force_words_ids ) input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] @@ -37,7 +61,7 @@ def generate(self, generated_tokens = [ o - i for i, o in zip(input_token_lengths, output_token_lengths)] - if (remove_input_from_output): + if (request.remove_input_from_output): output_tokens = [x[-i:] for x, i in zip(output_tokens, generated_tokens)] @@ -48,7 +72,10 @@ def generate(self, output_text = output_text[0] generated_tokens = generated_tokens[0] - return output_text, generated_tokens + return GenerateResponse( + text=output_text, + num_generated_tokens=generated_tokens + ) def shutdown(self) -> None: exit() diff --git a/inference/utils/requests.py b/inference/utils/requests.py new file mode 100644 index 000000000..2beac1ae3 --- /dev/null +++ b/inference/utils/requests.py @@ -0,0 +1,98 @@ +from typing import List, Union + +from pydantic import BaseModel + + +def parse_bool(value: str) -> bool: + if (value.lower() == "true"): + return True + elif (value.lower() == "false"): + return False + else: + raise ValueError("{} is not a valid boolean value".format(value)) + + +def parse_field(kwargs: dict, field: str, dtype: int, default_value: Any = None) -> Any: + if (field in kwargs): + if (dtype == bool): + return parse_bool(kwargs[field]) + else: + return dtype(kwargs[field]) + else: + return default_value + + +class GenerateRequest(BaseModel): + text: Union[List[str], str] + min_length: int = None + do_sample: bool = None + early_stopping: bool = None + num_beams: int = None + temperature: float = None + top_k: int = None + top_p: float = None + typical_p: float = None + repitition_penalty: float = None + bos_token_id: int = None + pad_token_id: int = 
None + eos_token_id: int = None + length_penalty: float = None + no_repeat_ngram_size: int = None + encoder_no_repeat_ngram_size: int = None + num_return_sequences: int = None + max_time: float = None + max_new_tokens: int = None + decoder_start_token_id: int = None + num_beam_groups: int = None + diversity_penalty: float = None + forced_bos_token_id: int = None + forced_eos_token_id: int = None + exponential_decay_length_penalty: float = None + bad_words_ids: List[int] = None + force_words_ids: Union[List[int], List[List[int]]] = None + remove_input_from_output: bool = False + + def __init__(self, text: Union[List[str], str], kwargs: dict) -> None: + self.text = text + self.min_length = parse_field(kwargs, "min_length", int) + self.do_sample = parse_field(kwargs, "do_sample", bool) + self.early_stopping = parse_field(kwargs, "early_stopping", bool) + self.num_beams = parse_field(kwargs, "num_beams", int) + self.temperature = parse_field(kwargs, "temperature", float) + self.top_k = parse_field(kwargs, "top_k", int) + self.top_p = parse_field(kwargs, "top_p", float) + self.typical_p = parse_field(kwargs, "typical_p", float) + self.repitition_penalty = parse_field( + kwargs, "repitition_penalty", float) + self.bos_token_id = parse_field(kwargs, "bos_token_id", int) + self.pad_token_id = parse_field(kwargs, "pad_token_id", int) + self.eos_token_id = parse_field(kwargs, "eos_token_id", int) + self.length_penalty = parse_field(kwargs, "length_penalty", float) + self.no_repeat_ngram_size = parse_field( + kwargs, "no_repeat_ngram_size", int) + self.encoder_no_repeat_ngram_size = parse_field( + kwargs, "encoder_no_repeat_ngram_size", int) + self.num_return_sequences = parse_field( + kwargs, "num_return_sequences", int) + self.max_time = parse_field(kwargs, "max_time", float) + self.max_new_tokens = parse_field(kwargs, "max_new_tokens", int) + self.decoder_start_token_id = parse_field( + kwargs, "decoder_start_token_id", int) + self.num_beam_group = parse_field(kwargs, "num_beam_group", int) + self.diversity_penalty = parse_field( + kwargs, "diversity_penalty", float) + self.forced_bos_token_id = parse_field( + kwargs, "forced_bos_token_id", int) + self.forced_eos_token_id = parse_field( + kwargs, "forced_eos_token_id", int) + self.exponential_decay_length_penalty = parse_field( + kwargs, "exponential_decay_length_penalty", float), + self.remove_input_from_output = parse_field( + kwargs, "remove_input_from_output", bool, False) + + +class GenerateResponse(BaseModel): + text: Union[List[str], str] = None + num_generated_tokens: Union[List[int], int] = None + query_id: int = None + total_time_taken: float = None diff --git a/inference/utils/utils.py b/inference/utils/utils.py index d294cd5d7..e9f6f73bc 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -6,6 +6,8 @@ import torch import torch.distributed as dist +from .requests import GenerateRequest + dummy_input_sentences = [ "DeepSpeed is a machine learning framework", @@ -111,108 +113,3 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[ input_sentences = input_sentences[:batch_size] return input_sentences - - -def parse_bool(value: str) -> bool: - if (value.lower() == "true"): - return True - elif (value.lower() == "false"): - return False - else: - raise ValueError("{} is not a valid boolean value".format(value)) - - -def parse_generate_kwargs(kwargs: dict) -> dict: - if ("min_length" in kwargs): - kwargs["min_length"] = int(kwargs["min_length"]) - if ("do_sample" in kwargs): - 
kwargs["do_sample"] = parse_bool(kwargs["do_sample"]) - if ("early_stopping" in kwargs): - kwargs["early_stopping"] = parse_bool(kwargs["early_stopping"]) - if ("num_beams" in kwargs): - kwargs["num_beams"] = int(kwargs["num_beams"]) - if ("temperature" in kwargs): - kwargs["temperature"] = float(kwargs["temperature"]) - if ("top_k" in kwargs): - kwargs["top_k"] = int(kwargs["top_k"]) - if ("top_p" in kwargs): - kwargs["top_p"] = float(kwargs["top_p"]) - if ("typical_p" in kwargs): - kwargs["typical_p"] = float(kwargs["typical_p"]) - if ("repitition_penalty" in kwargs): - kwargs["repitition_penalty"] = float(kwargs["repitition_penalty"]) - if ("bos_token_id" in kwargs): - kwargs["bos_token_id"] = int(kwargs["bos_token_id"]) - if ("pad_token_id" in kwargs): - kwargs["pad_token_id"] = int(kwargs["pad_token_id"]) - if ("eos_token_id" in kwargs): - kwargs["eos_token_id"] = int(kwargs["eos_token_id"]) - if ("length_penalty" in kwargs): - kwargs["length_penalty"] = float(kwargs["length_penalty"]) - if ("no_repeat_ngram_size" in kwargs): - kwargs["no_repeat_ngram_size"] = int(kwargs["no_repeat_ngram_size"]) - if ("encoder_no_repeat_ngram_size" in kwargs): - kwargs["encoder_no_repeat_ngram_size"] = int( - kwargs["encoder_no_repeat_ngram_size"]) - if ("num_return_sequences" in kwargs): - kwargs["num_return_sequences"] = int(kwargs["num_return_sequences"]) - if ("max_time" in kwargs): - kwargs["max_time"] = float(kwargs["max_time"]) - if ("max_new_tokens" in kwargs): - kwargs["max_new_tokens"] = int(kwargs["max_new_tokens"]) - if ("decoder_start_token_id" in kwargs): - kwargs["decoder_start_token_id"] = int( - kwargs["decoder_start_token_id"]) - if ("num_beam_groups" in kwargs): - kwargs["num_beam_groups"] = int(kwargs["num_beam_groups"]) - if ("diversity_penalty" in kwargs): - kwargs["diversity_penalty"] = float(kwargs["diversity_penalty"]) - if ("forced_bos_token_id" in kwargs): - kwargs["forced_bos_token_id"] = int(kwargs["forced_bos_token_id"]) - if ("forced_eos_token_id" in kwargs): - kwargs["forced_eos_token_id"] = int(kwargs["forced_eos_token_id"]) - if ("exponential_decay_length_penalty" in kwargs): - kwargs["exponential_decay_length_penalty"] = float( - kwargs["exponential_decay_length_penalty"]) - - # i was being lazy :) - if ("bad_words_ids" in kwargs): - del kwargs["bad_words_ids"] - if ("force_words_ids" in kwargs): - del kwargs["force_words_ids"] - - # so people don't slow down the server - if ("use_cache" in kwargs): - del kwargs["use_cache"] - if ("remove_invalid_values" in kwargs): - del kwargs["remove_invalid_values"] - if ("synced_gpus" in kwargs): - del kwargs["synced_gpus"] - if ("max_length" in kwargs): - del kwargs["max_length"] - - # no idea how to support this in a server setting - if ("prefix_allowed_tokens_fn" in kwargs): - del kwargs["prefix_allowed_tokens_fn"] - if ("logits_processor" in kwargs): - del kwargs["logits_processor"] - if ("renormalize_logits" in kwargs): - del kwargs["renormalize_logits"] - if ("stopping_criteria" in kwargs): - del kwargs["stopping_criteria"] - if ("constraints" in kwargs): - del kwargs["constraints"] - if ("output_attentions" in kwargs): - del kwargs["output_attentions"] - if ("output_hidden_states" in kwargs): - del kwargs["output_hidden_states"] - if ("output_scores" in kwargs): - del kwargs["output_scores"] - if ("return_dict_in_generate" in kwargs): - del kwargs["return_dict_in_generate"] - - remove_input_from_output = False - if ("remove_input_from_output" in kwargs): - remove_input_from_output = 
parse_bool(kwargs["remove_input_from_output"]) - - return kwargs, remove_input_from_output From 46ade324888c2280406d6c7e8cd25777fc04aed8 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 00:14:18 +0530 Subject: [PATCH 10/32] remove MaxTokensError --- inference/benchmark.py | 23 +++-- inference/cli.py | 9 +- inference/ds_inference/grpc_server.py | 29 +----- inference/server.py | 9 +- inference/utils/__init__.py | 5 +- inference/utils/requests.py | 127 ++++++++++++++------------ inference/utils/utils.py | 14 +-- 7 files changed, 101 insertions(+), 115 deletions(-) diff --git a/inference/benchmark.py b/inference/benchmark.py index 31af2a895..3f2e61241 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -12,7 +12,15 @@ from ds_inference import DSInferenceModel from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel -from utils import Execute, Model, get_argument_parser, get_dummy_batch, print_rank_n, GenerateRequest +from utils import ( + Execute, + GenerateRequest, + Model, + get_argument_parser, + get_dummy_batch, + parse_generate_kwargs, + print_rank_n +) def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: @@ -67,17 +75,16 @@ def benchmark_end_to_end(args: argparse.Namespace, Execute(model_class, {"args": args}) ) - print_rank_n( - f"*** Starting to generate {args.generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}") - - input_sentences = get_dummy_batch(args.batch_size) + request = parse_generate_kwargs( + get_dummy_batch(args.batch_size), + args.generate_kwargs + ) - print_rank_n(f"Generate args {args.generate_kwargs}") + print_rank_n(f"generate_kwargs = {request}") + print_rank_n(f"batch_size = {args.batch_size}") # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - request = GenerateRequest(input_sentences, args.generate_kwargs) - response, generation_time = run_and_log_time( Execute(model.generate, {"request": request})) diff --git a/inference/cli.py b/inference/cli.py index e54a0b192..742f07b0e 100644 --- a/inference/cli.py +++ b/inference/cli.py @@ -1,8 +1,6 @@ import argparse import json -import deepspeed - import constants import utils from ds_inference import DSInferenceGRPCServer @@ -48,7 +46,6 @@ def main() -> None: f"Unknown deployment framework {args.deployment_framework}") generate_kwargs = args.generate_kwargs - request = parse_generate_kwargs(generate_kwargs) while (True): # currently only 1 process is running so its @@ -61,11 +58,9 @@ def main() -> None: model.shutdown() if (input("change generate_kwargs? 
[y/n] ") == "y"): - generate_kwargs = input("Generate kwargs: ") - generate_kwargs = json.loads(generate_kwargs) - request = parse_generate_kwargs(generate_kwargs) + generate_kwargs = json.loads(input("generate_kwargs: ")) - request.text = input_text + request = parse_generate_kwargs(input_text, generate_kwargs) response = model.generate(request) print_rank_n("Output text:", response.text) diff --git a/inference/ds_inference/grpc_server.py b/inference/ds_inference/grpc_server.py index e1eece44a..30cace136 100644 --- a/inference/ds_inference/grpc_server.py +++ b/inference/ds_inference/grpc_server.py @@ -6,7 +6,7 @@ from transformers import AutoTokenizer import mii -from utils import GenerateRequest, GenerateResponse, Model, get_str_dtype +from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype class DSInferenceGRPCServer(Model): @@ -51,32 +51,7 @@ def generate(self, request: GenerateRequest) -> GenerateResponse: output_text = self.model.query( {"query": text}, - min_length=request.min_length, - do_sample=request.do_sample, - early_stopping=request.early_stopping, - num_beams=request.num_beams, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - typical_p=request.typical_p, - repitition_penalty=request.repitition_penalty, - bos_token_id=request.bos_token_id, - pad_token_id=request.pad_token_id, - eos_token_id=request.eos_token_id, - length_penalty=request.length_penalty, - no_repeat_ngram_size=request.no_repeat_ngram_size, - encoder_no_repeat_ngram_size=request.encoder_no_repeat_ngram_size, - num_return_sequences=request.num_return_sequences, - max_time=request.max_time, - max_new_tokens=request.max_new_tokens, - decoder_start_token_id=request.decoder_start_token_id, - num_beam_groups=request.num_beam_groups, - diversity_penalty=request.diversity_penalty, - forced_bos_token_id=request.forced_bos_token_id, - forced_eos_token_id=request.forced_eos_token_id, - exponential_decay_length_penalty=request.exponential_decay_length_penalty, - bad_words_ids=request.bad_words_ids, - force_words_ids=request.force_words_ids + **get_filter_dict(request) ).response output_text = [_ for _ in output_text] diff --git a/inference/server.py b/inference/server.py index 8876485a4..2829de3d5 100644 --- a/inference/server.py +++ b/inference/server.py @@ -7,7 +7,7 @@ from ds_inference import DSInferenceGRPCServer from fastapi import FastAPI, HTTPException from hf_accelerate import HFAccelerateModel -from utils import GenerateRequest, MaxTokensError, get_argument_parser +from utils import GenerateRequest, get_argument_parser, get_num_tokens_to_generate from uvicorn import run @@ -69,11 +69,8 @@ def generate(request: GenerateRequest) -> dict: try: start_time = time.time() - if (request.max_new_tokens > args.allowed_max_new_tokens): - raise MaxTokensError( - request.max_new_tokens, - args.allowed_max_new_tokens - ) + request.max_new_tokens = get_num_tokens_to_generate( + request.max_new_tokens, args.allowed_max_new_tokens) response = model.generate(request) response.query_id = query_id diff --git a/inference/utils/__init__.py b/inference/utils/__init__.py index 063d413cb..77bb4ceaa 100644 --- a/inference/utils/__init__.py +++ b/inference/utils/__init__.py @@ -1,13 +1,12 @@ from .model import Model -from .requests import GenerateRequest, GenerateResponse +from .requests import GenerateRequest, GenerateResponse, get_filter_dict, parse_generate_kwargs from .utils import ( Execute, - MaxTokensError, get_args, get_argument_parser, get_dummy_batch, + 
get_num_tokens_to_generate, get_str_dtype, - parse_generate_kwargs, print_rank_n, run_rank_n ) diff --git a/inference/utils/requests.py b/inference/utils/requests.py index 2beac1ae3..2bf5aff07 100644 --- a/inference/utils/requests.py +++ b/inference/utils/requests.py @@ -1,27 +1,8 @@ -from typing import List, Union +from typing import Any, List, Union from pydantic import BaseModel -def parse_bool(value: str) -> bool: - if (value.lower() == "true"): - return True - elif (value.lower() == "false"): - return False - else: - raise ValueError("{} is not a valid boolean value".format(value)) - - -def parse_field(kwargs: dict, field: str, dtype: int, default_value: Any = None) -> Any: - if (field in kwargs): - if (dtype == bool): - return parse_bool(kwargs[field]) - else: - return dtype(kwargs[field]) - else: - return default_value - - class GenerateRequest(BaseModel): text: Union[List[str], str] min_length: int = None @@ -52,47 +33,77 @@ class GenerateRequest(BaseModel): force_words_ids: Union[List[int], List[List[int]]] = None remove_input_from_output: bool = False - def __init__(self, text: Union[List[str], str], kwargs: dict) -> None: - self.text = text - self.min_length = parse_field(kwargs, "min_length", int) - self.do_sample = parse_field(kwargs, "do_sample", bool) - self.early_stopping = parse_field(kwargs, "early_stopping", bool) - self.num_beams = parse_field(kwargs, "num_beams", int) - self.temperature = parse_field(kwargs, "temperature", float) - self.top_k = parse_field(kwargs, "top_k", int) - self.top_p = parse_field(kwargs, "top_p", float) - self.typical_p = parse_field(kwargs, "typical_p", float) - self.repitition_penalty = parse_field( - kwargs, "repitition_penalty", float) - self.bos_token_id = parse_field(kwargs, "bos_token_id", int) - self.pad_token_id = parse_field(kwargs, "pad_token_id", int) - self.eos_token_id = parse_field(kwargs, "eos_token_id", int) - self.length_penalty = parse_field(kwargs, "length_penalty", float) - self.no_repeat_ngram_size = parse_field( - kwargs, "no_repeat_ngram_size", int) - self.encoder_no_repeat_ngram_size = parse_field( - kwargs, "encoder_no_repeat_ngram_size", int) - self.num_return_sequences = parse_field( - kwargs, "num_return_sequences", int) - self.max_time = parse_field(kwargs, "max_time", float) - self.max_new_tokens = parse_field(kwargs, "max_new_tokens", int) - self.decoder_start_token_id = parse_field( - kwargs, "decoder_start_token_id", int) - self.num_beam_group = parse_field(kwargs, "num_beam_group", int) - self.diversity_penalty = parse_field( - kwargs, "diversity_penalty", float) - self.forced_bos_token_id = parse_field( - kwargs, "forced_bos_token_id", int) - self.forced_eos_token_id = parse_field( - kwargs, "forced_eos_token_id", int) - self.exponential_decay_length_penalty = parse_field( - kwargs, "exponential_decay_length_penalty", float), - self.remove_input_from_output = parse_field( - kwargs, "remove_input_from_output", bool, False) - class GenerateResponse(BaseModel): text: Union[List[str], str] = None num_generated_tokens: Union[List[int], int] = None query_id: int = None total_time_taken: float = None + + +def parse_bool(value: str) -> bool: + if (value.lower() == "true"): + return True + elif (value.lower() == "false"): + return False + else: + raise ValueError("{} is not a valid boolean value".format(value)) + + +def parse_field(kwargs: dict, + field: str, + dtype: int, + default_value: Any = None) -> Any: + if (field in kwargs): + if (type(kwargs[field]) == dtype): + return kwargs[field] + elif (dtype == bool): + 
return parse_bool(kwargs[field]) + else: + return dtype(kwargs[field]) + else: + return default_value + + +def parse_generate_kwargs(text: Union[List[str], str], kwargs: dict) -> GenerateRequest: + return GenerateRequest( + text=text, + min_length=parse_field(kwargs, "min_length", int), + do_sample=parse_field(kwargs, "do_sample", bool), + early_stopping=parse_field(kwargs, "early_stopping", bool), + num_beams=parse_field(kwargs, "num_beams", int), + temperature=parse_field(kwargs, "temperature", float), + top_k=parse_field(kwargs, "top_k", int), + top_p=parse_field(kwargs, "top_p", float), + typical_p=parse_field(kwargs, "typical_p", float), + repitition_penalty=parse_field(kwargs, "repitition_penalty", float), + bos_token_id=parse_field(kwargs, "bos_token_id", int), + pad_token_id=parse_field(kwargs, "pad_token_id", int), + eos_token_id=parse_field(kwargs, "eos_token_id", int), + length_penalty=parse_field(kwargs, "length_penalty", float), + no_repeat_ngram_size=parse_field(kwargs, "no_repeat_ngram_size", int), + encoder_no_repeat_ngram_size=parse_field( + kwargs, "encoder_no_repeat_ngram_size", int), + num_return_sequences=parse_field(kwargs, "num_return_sequences", int), + max_time=parse_field(kwargs, "max_time", float), + max_new_tokens=parse_field(kwargs, "max_new_tokens", int), + decoder_start_token_id=parse_field( + kwargs, "decoder_start_token_id", int), + num_beam_group=parse_field(kwargs, "num_beam_group", int), + diversity_penalty=parse_field(kwargs, "diversity_penalty", float), + forced_bos_token_id=parse_field(kwargs, "forced_bos_token_id", int), + forced_eos_token_id=parse_field(kwargs, "forced_eos_token_id", int), + exponential_decay_length_penalty=parse_field( + kwargs, "exponential_decay_length_penalty", float), + remove_input_from_output=parse_field( + kwargs, "remove_input_from_output", bool, False) + ) + + +def get_filter_dict(d: BaseModel) -> dict: + d = dict(d) + q = {} + for i in d: + if (d[i] != None): + q[i] = d[i] + return q diff --git a/inference/utils/utils.py b/inference/utils/utils.py index e9f6f73bc..d68cb026b 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -21,12 +21,6 @@ ] -class MaxTokensError(Exception): - def __init__(self, max_new_tokens: int, allowed_max_new_tokens: int) -> None: - super().__init__("max_new_tokens = {} > {} is not supported.".format( - max_new_tokens, allowed_max_new_tokens)) - - class Execute: def __init__(self, func: callable, kwargs: dict) -> None: self.func = func @@ -113,3 +107,11 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[ input_sentences = input_sentences[:batch_size] return input_sentences + + +def get_num_tokens_to_generate(max_new_tokens: int, + allowed_max_new_tokens: int) -> int: + if (max_new_tokens == None): + return allowed_max_new_tokens + else: + return min(max_new_tokens, allowed_max_new_tokens) From c46d95764d39147171fc6fd1a9920a9aa5610892 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 03:16:43 +0530 Subject: [PATCH 11/32] fix batch size error with DS inference server --- inference/cli.py | 2 +- inference/server.py | 30 ++++++++++++++++++++++++++++-- inference/utils/requests.py | 1 + 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/inference/cli.py b/inference/cli.py index 742f07b0e..3ab5e356e 100644 --- a/inference/cli.py +++ b/inference/cli.py @@ -58,7 +58,7 @@ def main() -> None: model.shutdown() if (input("change generate_kwargs? 
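A hypothetical use of `parse_generate_kwargs` and `get_filter_dict` as defined above, assuming the `inference/utils` package layout in this patch. Values arrive as strings (for example from a JSON payload) and are coerced by `parse_field`; `get_filter_dict` then drops the unset fields before they are forwarded to `model.generate` or `mii.query`.

```python
from utils import get_filter_dict, parse_generate_kwargs

# placeholder prompt and kwargs; string values are coerced by parse_field
request = parse_generate_kwargs(
    "Peace is the only way",
    {"max_new_tokens": "40", "do_sample": "true", "top_k": "5"},
)
print(request.max_new_tokens, request.do_sample, request.top_k)  # 40 True 5

# only the explicitly set generation arguments survive the filtering
print(get_filter_dict(request))
```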
[y/n] ") == "y"): - generate_kwargs = json.loads(input("generate_kwargs: ")) + generate_kwargs = json.loads(input("Generate kwargs: ")) request = parse_generate_kwargs(input_text, generate_kwargs) response = model.generate(request) diff --git a/inference/server.py b/inference/server.py index 2829de3d5..a23e1bbe6 100644 --- a/inference/server.py +++ b/inference/server.py @@ -1,6 +1,8 @@ import argparse import logging +import sys import time +import traceback import constants import utils @@ -33,6 +35,8 @@ def get_args() -> argparse.Namespace: help="number of http workers") group.add_argument("--allowed_max_new_tokens", type=int, default=100, help="max allowed tokens") + group.add_argument("--debug", action="store_true", + help="launch in debug mode") args = utils.get_args(parser) @@ -61,6 +65,18 @@ def get_args() -> argparse.Namespace: #################################################################################### +def get_stack_trace(e_stack_trace): + trace_back = traceback.extract_tb(e_stack_trace) + + # Format stacktrace + stack_trace = [] + for trace in trace_back: + stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( + trace[0], trace[1], trace[2], trace[3])) + + return stack_trace + + @app.post("/generate/") def generate(request: GenerateRequest) -> dict: # needs to be global since it is updated @@ -78,9 +94,19 @@ def generate(request: GenerateRequest) -> dict: query_id += 1 return response - except Exception as e: + except Exception: + e_type, e_message, e_stack_trace = sys.exc_info() + response = { + "error": str(e_type.__name__), + "message": str(e_message), + "query_id": query_id + } + + if (args.debug): + response["stack_trace"] = get_stack_trace(e_stack_trace), + query_id += 1 - raise HTTPException(500, {"error": str(e)}) + raise HTTPException(500, response) run(app, host=args.host, port=args.port, workers=args.workers) diff --git a/inference/utils/requests.py b/inference/utils/requests.py index 2bf5aff07..b8d8965e1 100644 --- a/inference/utils/requests.py +++ b/inference/utils/requests.py @@ -106,4 +106,5 @@ def get_filter_dict(d: BaseModel) -> dict: for i in d: if (d[i] != None): q[i] = d[i] + del q["text"] return q From f3dac05adf014a592c15184b2db5eccde33b368c Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 03:51:14 +0530 Subject: [PATCH 12/32] type fix --- inference/benchmark.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/benchmark.py b/inference/benchmark.py index 3f2e61241..6fd2ddd04 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -2,7 +2,7 @@ import gc import os import time -from typing import Any, List, Union +from typing import Any, List, Tuple, Union import deepspeed import torch @@ -23,7 +23,7 @@ ) -def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]: +def run_and_log_time(execs: Union[List[Execute], Execute]) -> Tuple[Union[List[Any], Any], float]: """ runs a list of Execute objects and returns a list of outputs and the time taken """ From 44256140e3a05991e546c3bc9b49ecc3c6233f40 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 04:17:32 +0530 Subject: [PATCH 13/32] add latency --- inference/benchmark.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/inference/benchmark.py b/inference/benchmark.py index 6fd2ddd04..4a78191d4 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -53,9 +53,9 @@ def benchmark_generation(model: Model, def 
get_benchmark_results(benchmark_time: float, initialization_time: float, - generation_time: float, total_new_tokens_generated: int, - batch_size: int) -> str: + batch_size: int, + cycles: int) -> str: throughput = total_new_tokens_generated / benchmark_time return f""" *** Performance stats: @@ -63,7 +63,7 @@ def get_benchmark_results(benchmark_time: float, Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token Model loading time = {initialization_time:.2f} secs Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Generation time per batch = {generation_time:.2f} secs +Latency = {benchmark_time / cycles:.2f} secs Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs """ @@ -85,8 +85,7 @@ def benchmark_end_to_end(args: argparse.Namespace, # warmup is a must if measuring speed as it's when all the optimizations are performed # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - response, generation_time = run_and_log_time( - Execute(model.generate, {"request": request})) + response = model.generate(request) for i, (o, _) in zip(request.text, zip(response.text, response.num_generated_tokens)): print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n") @@ -122,9 +121,9 @@ def benchmark_end_to_end(args: argparse.Namespace, get_benchmark_results( benchmark_time, initialization_time, - generation_time, total_new_tokens_generated, - args.batch_size + args.batch_size, + args.benchmark_cycles ) ) From c97d6ea671a6426960f3554800152b7e2b87f965 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 04:50:24 +0530 Subject: [PATCH 14/32] add latency --- inference/benchmark.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/inference/benchmark.py b/inference/benchmark.py index 4a78191d4..f2c60258d 100644 --- a/inference/benchmark.py +++ b/inference/benchmark.py @@ -57,14 +57,15 @@ def get_benchmark_results(benchmark_time: float, batch_size: int, cycles: int) -> str: throughput = total_new_tokens_generated / benchmark_time + latency = benchmark_time / cycles return f""" *** Performance stats: Throughput (including tokenization) = {throughput:.2f} tokens/sec Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token Model loading time = {initialization_time:.2f} secs Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Latency = {benchmark_time / cycles:.2f} secs -Model loading time + generation time per batch = {initialization_time + generation_time:.2f} secs +Latency = {latency:.2f} secs +Model loading time + generation time per batch = {initialization_time + latency:.2f} secs """ From f3385f2b49a46725a771e8f00069e4c2580ae8ad Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 16:16:28 +0530 Subject: [PATCH 15/32] add min_length to default kwargs --- inference/utils/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/utils/utils.py b/inference/utils/utils.py index d68cb026b..1d8c39c2c 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -42,6 +42,7 @@ def get_argument_parser() -> argparse.ArgumentParser: "--generate_kwargs", type=dict, default={ + "min_length": 100, "max_new_tokens": 100, "do_sample": False }, From 8f25200b84cc76ffd3743bca2e88adcf7141d6ac Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 16:48:38 +0530 Subject: [PATCH 16/32] str kwargs --- inference/utils/utils.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 
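The benchmark arithmetic above reduces to a few lines. Worked through with made-up numbers (5 cycles, batch size 8, 100 new tokens per sequence, 21 seconds of benchmark time):

```python
cycles = 5
batch_size = 8
max_new_tokens = 100
benchmark_time = 21.0  # secs, hypothetical

total_new_tokens_generated = cycles * batch_size * max_new_tokens  # 4000
throughput = total_new_tokens_generated / benchmark_time           # ~190.5 tokens/sec
msecs_per_token = 1000 / throughput                                # ~5.25 msecs/token
latency = benchmark_time / cycles                                  # 4.2 secs per batch
print(f"{throughput:.2f} tokens/sec, {msecs_per_token:.2f} msecs/token, {latency:.2f} secs/batch")
```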
deletions(-) diff --git a/inference/utils/utils.py b/inference/utils/utils.py index 1d8c39c2c..c29c8b7d9 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -1,5 +1,6 @@ import argparse import copy +import json import math from typing import Any, List @@ -40,12 +41,8 @@ def get_argument_parser() -> argparse.ArgumentParser: choices=["bf16", "fp16"], help="dtype for model") group.add_argument( "--generate_kwargs", - type=dict, - default={ - "min_length": 100, - "max_new_tokens": 100, - "do_sample": False - }, + type=str, + default='{"min_length": 100, "max_new_tokens": 100, "do_sample": False}', help="generate parameters. look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters" ) @@ -55,6 +52,7 @@ def get_argument_parser() -> argparse.ArgumentParser: def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() args.dtype = get_torch_dtype(args.dtype) + args.generate_kwargs = json.loads(args.generate_kwargs) return args From b11bb7fad53db66aceb0cea8b6356fafd9bf5e41 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 17 Aug 2022 16:58:35 +0530 Subject: [PATCH 17/32] str kwargs --- inference/utils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/utils/utils.py b/inference/utils/utils.py index c29c8b7d9..20eb393ac 100644 --- a/inference/utils/utils.py +++ b/inference/utils/utils.py @@ -42,7 +42,7 @@ def get_argument_parser() -> argparse.ArgumentParser: group.add_argument( "--generate_kwargs", type=str, - default='{"min_length": 100, "max_new_tokens": 100, "do_sample": False}', + default='{"min_length": 100, "max_new_tokens": 100, "do_sample": false}', help="generate parameters. 
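Since `--generate_kwargs` is now a JSON string parsed with `json.loads`, booleans must be lowercase JSON literals; the Python-style `False` used in the previous default is rejected, which is what the change above addresses.

```python
import json

json.loads('{"min_length": 100, "max_new_tokens": 100, "do_sample": false}')  # ok
try:
    json.loads('{"do_sample": False}')  # Python literal, not valid JSON
except json.JSONDecodeError as e:
    print("invalid JSON:", e)
```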
look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters" ) From 99dedb03bb4cb032f7207d60991d65738a6ac36c Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 18 Aug 2022 10:35:32 +0530 Subject: [PATCH 18/32] fix comma --- inference/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/server.py b/inference/server.py index a23e1bbe6..b35e6274c 100644 --- a/inference/server.py +++ b/inference/server.py @@ -103,7 +103,7 @@ def generate(request: GenerateRequest) -> dict: } if (args.debug): - response["stack_trace"] = get_stack_trace(e_stack_trace), + response["stack_trace"] = get_stack_trace(e_stack_trace) query_id += 1 raise HTTPException(500, response) From 497f00ef146b354b974ecc24199171a4a7850c64 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 18 Aug 2022 12:44:29 +0530 Subject: [PATCH 19/32] add old scripts back --- scripts/inference/README.md | 195 ++++++++++++ .../inference/bloom-accelerate-inference.py | 186 +++++++++++ scripts/inference/bloom-ds-inference.py | 299 ++++++++++++++++++ scripts/inference/bloom-ds-zero-inference.py | 211 ++++++++++++ 4 files changed, 891 insertions(+) create mode 100644 scripts/inference/README.md create mode 100644 scripts/inference/bloom-accelerate-inference.py create mode 100644 scripts/inference/bloom-ds-inference.py create mode 100644 scripts/inference/bloom-ds-zero-inference.py diff --git a/scripts/inference/README.md b/scripts/inference/README.md new file mode 100644 index 000000000..44e98f9fb --- /dev/null +++ b/scripts/inference/README.md @@ -0,0 +1,195 @@ +# Inference scripts for BLOOM + +## BLOOM Inference solutions + +Here are some stats on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: + +All benchmarks are doing greedy generation of 100 token outputs: +``` +Generate args {'min_length': 100, 'max_length': 100, 'do_sample': False} +``` +The inputs are just a few tokens. + +Throughput in msecs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :---- | :---- | :---- | :---- | :---- | :--- | +| accelerate | 230.38 | 31.78 | 17.84 | 10.89 | oom | omm | +| ds-inference | 40.57 | 5.23 | | | 2.77 | 0.66 | +| ds-zero | 283 | 34.88 | oom | oom | oom | oom | + + +Start to ready to generate in secs: + +| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | +| :----------- | :--- | :--- | :--- | :--- | :--- | :--- | +| accelerate | 121 | 120 | 113 | 118 | | | +| ds-inference | 662 | 673 | | | 685 | 654 | +| ds-zero | 462 | 463 | | | | | +| | | | | | | | + + +DS-Inference load time (start to ready to generate) will become much faster soon. Once we stop relying on ds-zero to instantiate the model on gpu. The plan is to pre-shard the weights TP-wise for 8x and 16x gpus and load them directly on each gpu. Will probably be under 1min. + + +## Deepspeed-Inference + +Tensor-Parallelism and efficient fused CUDA kernels: +https://www.deepspeed.ai/tutorials/inference-tutorial/ + +### Setup + +``` +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +pip install . 
+``` + +### Run + +``` +deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom +``` + +Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) + +Adding `--benchmark` to activate the benchmarks + + +BS=1 +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt +[...] + +``` + +While processing memory per process: + +- GPU: ~50GB +- CPU: ~10GB + + +BS=8 +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-inference_bs=8.txt +[...] +*** Performance stats: +Throughput per token including tokenize: 5.23 msecs +Start to ready to generate: 683.397 secs +Tokenize and generate 800 (bs=8) tokens: 4.241 secs +Start to finish: 687.638 secs +``` + +BS=64 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark 2>&1 | tee bloom-ds-inference_bs=64.txt + + + + +``` + +BS=128 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 128 --benchmark 2>&1 | tee bloom-ds-inference_bs=128.txt + + + + +``` + +## Deepspeed ZeRO-Inference + +https://www.deepspeed.ai/tutorials/zero/ + +### Setup + +``` +pip install deepspeed +``` + + +### Run + +Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. + + +BS=1 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +[...] +*** Performance stats: +Throughput per token including tokenize: 282.93 msecs +Start to ready to generate: 501.871 secs +Tokenize and generate 800 (bs=1) tokens: 226.188 secs +Start to finish: 728.060 secs +``` + + +BS=8 + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt +[...] + +*** Performance stats: +Throughput per token including tokenize: 34.57 msecs +Start to ready to generate: 482.132 secs +Tokenize and generate 6400 (bs=8) tokens: 221.236 secs +Start to finish: 703.368 secs +``` + +BS=16 and higher OOMs + +``` +$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt +[...] +OOM + +``` + + + +## HF Accelerate + +https://github.com/huggingface/accelerate + +### Setup + +``` +pip install transformers +``` + + + +### Run + + + + +BS=1 +``` +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt +[...] + + +``` + +BS=8 +``` +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt +[...] + + +``` + +BS=16 +``` +$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt +[...] 
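The throughput tables in the README above report milliseconds per token (tokenization included), amortized over the whole batch, which is why larger batch sizes look faster per token. Converting a couple of those entries to tokens per second:

```python
def tokens_per_sec(msecs_per_token: float) -> float:
    return 1000.0 / msecs_per_token

print(tokens_per_sec(230.38))  # accelerate,   bs=1   -> ~4.3 tokens/sec
print(tokens_per_sec(0.66))    # ds-inference, bs=128 -> ~1515 tokens/sec
```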
+ + +``` diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/inference/bloom-accelerate-inference.py new file mode 100644 index 000000000..415b2f765 --- /dev/null +++ b/scripts/inference/bloom-accelerate-inference.py @@ -0,0 +1,186 @@ +import argparse +import time +import os +import gc +import torch +import math +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") + parser.add_argument("--name", type=str, help="Name path", required=True) + parser.add_argument("--batch_size", default=1, type=int, help="batch size") + parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") + parser.add_argument("--greedy", action="store_true") + parser.add_argument("--top-k", type=int, default=0) + parser.add_argument("--top-p", type=float, default=0.) + + return parser.parse_args() + +def get_max_memory_per_gpu_dict(dtype, model_name): + """ try to generate the memory map based on what we know about the model and the available hardware """ + + # figure out the memory map - the minimum per gpu required to load the model + n_gpus = torch.cuda.device_count() + + if model_name == "bigscience/bloom" and n_gpus == 8 and torch.cuda.get_device_properties(0).total_memory > 79*2**30: + # hand crafted optimized memory map for 8x80 setup over BLOOM + # this works with bs=40 + return {0: '0GIB', 1: '51GIB', 2: '51GIB', 3: '51GIB', 4: '51GIB', 5: '51GIB', 6: '51GIB', 7: '51GIB'} + + try: + # model_params calculation, as we don't have a model yet to do: + #model_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + + config = AutoConfig.from_pretrained(model_name) + h = config.n_embed + l = config.n_layer + v = config.vocab_size + # from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing + model_params = l*(12*h**2 + 13*h) + v*h + 4*h + except: + print(f"The model {model_name} has a broken config file. 
Please notify the owner") + raise + + bytes = torch.finfo(dtype).bits / 8 + param_memory_total_in_bytes = model_params * bytes + # add 5% since weight sizes aren't the same and some GPU may need more memory + param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05) + print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights") + + # check the real available memory + # load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB) + torch.ones(1).cuda() + max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0] + if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes: + raise ValueError(f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)") + + return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())} + +t_start = time.time() + +num_tokens = 100 + +args = get_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +rank = local_rank + +model_name = args.name +if rank == 0: + print(f"Loading model {model_name}") + + +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +#print(get_max_memory_per_gpu_dict()) + + +model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="auto", + max_memory=get_max_memory_per_gpu_dict(dtype, model_name), + torch_dtype=dtype, +) + + +if args.benchmark: + t_ready = time.time() + + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = dict(max_new_tokens=num_tokens, do_sample=False) +#generate_kwargs = dict(max_new_tokens=num_tokens, use_cache=False, do_sample=False) +#generate_kwargs = dict(min_length=num_tokens, max_length=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to("cuda:0") + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. 
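As a rough, hand-checkable instance of the sizing formula in `get_max_memory_per_gpu_dict` above: the snippet assumes the published BLOOM-176B config values (70 layers, hidden size 14336, vocabulary 250880) and bf16/fp16 weights, and should be read as an estimate rather than an exact memory plan.

```python
l, h, v = 70, 14336, 250880   # assumed BLOOM-176B config: n_layer, n_embed, vocab_size
n_gpus = 8

model_params = l * (12 * h**2 + 13 * h) + v * h + 4 * h   # roughly 176 billion parameters
bytes_per_param = 2                                       # bf16 / fp16
param_memory_total_in_bytes = model_params * bytes_per_param
param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05)

print(f"{model_params / 1e9:.1f}B params, "
      f"~{param_memory_per_gpu_in_bytes / 2**30:.0f}GB of weights per GPU")
```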
on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +generated = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in generated: + print(f"{'-'*60}\nin={i}\nout={o}\n") + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + +### Benchmark + +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + torch.cuda.synchronize() + if rank == 0: + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/inference/bloom-ds-inference.py new file mode 100644 index 000000000..c21dfeb96 --- /dev/null +++ b/scripts/inference/bloom-ds-inference.py @@ -0,0 +1,299 @@ +# usage: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom +# +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark +# + + +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. 
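The warmup comment above is worth generalizing: the first pass pays for kernel compilation and other one-time setup, so the timed loop must come after a warmup call and be bracketed by `torch.cuda.synchronize()`. A generic sketch of that pattern, where `step` stands in for a `generate()` call:

```python
import time

import torch


def benchmark(step, cycles: int = 5) -> float:
    step()                        # warmup: triggers one-time CUDA/kernel setup
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # make sure the warmup work has finished

    t0 = time.time()
    for _ in range(cycles):
        step()
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock
    return (time.time() - t0) / cycles  # average secs per cycle

# e.g. latency = benchmark(lambda: model.generate(**input_tokens, **generate_kwargs))
```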
+# + + +from argparse import ArgumentParser +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers.deepspeed import HfDeepSpeedConfig +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +import deepspeed +import gc +import glob +import io +import json +import math +import os +import sys +import time +import torch +import torch.distributed as dist + +t_start = time.time() + +num_tokens = 100 + +parser = ArgumentParser() + +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +args = parser.parse_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + +deepspeed.init_distributed('nccl') +rank = dist.get_rank() + + +### Model loading and instantiating on GPUs + +def get_checkpoint_files(pretrained_model_name_or_path): + # XXX: I just hacked this one together to automatically handle the fetching of the model file or + # shards into cache and returning the cached entries - note that I removed most arguments + + from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME, cached_path, hf_bucket_url, is_offline_mode + from transformers.utils.hub import EntryNotFoundError + from transformers.modeling_utils import get_checkpoint_shard_files + + cache_dir = None + is_sharded = False + + # XXX: preparation for revision branches if needed + revision = None + #revision = "sharded" + + # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with + # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" + if is_offline_mode(): + print("Offline mode: forcing local_files_only=True") + local_files_only = True + else: + local_files_only = False + + filename = WEIGHTS_NAME + archive_file = hf_bucket_url(pretrained_model_name_or_path, filename=filename, revision=revision) + + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) + return [resolved_archive_file] + + except (EntryNotFoundError, FileNotFoundError): + if filename == WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=WEIGHTS_INDEX_NAME, + revision=revision, + ) + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + local_files_only=local_files_only, + ) + is_sharded = True + + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + revision=revision + ) + + return resolved_archive_file + +model_name = args.name + +#print(get_checkpoint_files(model_name)) + +if rank == 0: + print(f"*** Loading the model {model_name}") + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +#dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + + +# use one of these args to `init_inference` +# 1. injection_policy is the slower version, but it's plain pytorch so it'll always work +# 2. replace_with_kernel_inject is the faster one (fast fused kernels) +kernel_inject = True +#kernel_inject = False + +if kernel_inject: + # XXX: for now ds-inference only works with fp16 + dtype = torch.float16 +else: + dtype = torch.bfloat16 + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) + +# Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load +with deepspeed.OnDevice(dtype=dtype, device='meta'): + model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16) + +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) + +model = model.eval() + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-init-ds-zero-init', force=True) + +### Deepspeed-Inference Loading + +checkpoints_json = "checkpoints.json" +def write_checkponts_json(): + + with io.open(checkpoints_json, 'w', encoding='utf-8') as f: + + #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" + #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + checkpoint_files = get_checkpoint_files(model_name) + + #print("Checkpoint files:", checkpoint_files) + + data = { + "type": "BLOOM-176B", + "checkpoints": checkpoint_files, + "version": 1.0 + } + json.dump(data, f) + +if rank == 0: + write_checkponts_json() +dist.barrier() + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-ds-inference-init', force=True) + +if kernel_inject: + kwargs = dict(replace_with_kernel_inject=True) +else: + kwargs = dict(injection_policy={BloomBlock: ('self_attention.dense', 'mlp.dense_4h_to_h')}) + +#checkpoints_json=None +model = deepspeed.init_inference(model, + mp_size=world_size, + dtype=torch.half, + checkpoint=checkpoints_json, + **kwargs, + ) + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('post-ds-inference-init', force=True) + + +model = model.module + +if args.benchmark: + t_ready = time.time() + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +generated = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in generated: + print(f"{'-'*60}\nin={i}\nout={o}\n") + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark + +# benchmark it! +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + torch.cuda.synchronize() + if rank == 0: + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/inference/bloom-ds-zero-inference.py new file mode 100644 index 000000000..043b4967f --- /dev/null +++ b/scripts/inference/bloom-ds-zero-inference.py @@ -0,0 +1,211 @@ +# usage: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom +# +# to run benchmarks: +# deepspeed --num_gpus 8 bloom-ds-inference.py --name bigscience/bloom --benchmark +# + + +# This is going to improve, but at the moment, the process is a bit cumbersome - we first use +# 1. use Deepspeed-ZeRO to instantiate the model on GPUs, w/o loading the checkpoints, +# 2. free the allocated storage +# 3. start Deepspeed-Inference and only now load the checkpoint +# 4. run generate +# Done. 
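The script that follows takes the ZeRO route instead of fused kernels: build a ZeRO stage-3 config, create an `HfDeepSpeedConfig` before `from_pretrained` so the weights are sharded across GPUs at load time, then wrap the model with `deepspeed.initialize`. The sketch below is a minimal illustration of that flow (assumes a `deepspeed --num_gpus 8` launch, bf16, and the `WORLD_SIZE` environment variable; it is not a drop-in replacement for the full script, which adds benchmarking and a `--cpu_offload` switch).

```python
# Minimal ZeRO-3 inference sketch mirroring the script that follows.
import os

import deepspeed
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.deepspeed import HfDeepSpeedConfig

model_name = "bigscience/bloom"
world_size = int(os.getenv("WORLD_SIZE", "1"))
config = AutoConfig.from_pretrained(model_name)
hidden = config.hidden_size

ds_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": hidden * hidden,
        "stage3_prefetch_bucket_size": 0.9 * hidden * hidden,
        "stage3_param_persistence_threshold": 0,
        # "offload_param": {"device": "cpu", "pin_memory": True},  # equivalent of --cpu_offload
    },
    "train_batch_size": world_size,
    "train_micro_batch_size_per_gpu": 1,
}

# must exist *before* from_pretrained() so weights go straight to the ZeRO shards
dschf = HfDeepSpeedConfig(ds_config)  # keep a reference alive

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
model = deepspeed.initialize(model=model, config_params=ds_config)[0].module

inputs = tokenizer(["DeepSpeed is a machine learning framework"],
                   return_tensors="pt", padding=True).to(torch.cuda.current_device())
outputs = model.generate(**inputs, max_new_tokens=100, do_sample=False)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```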
+# + + +from argparse import ArgumentParser +from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig +from transformers.deepspeed import HfDeepSpeedConfig +from transformers.models.bloom.modeling_bloom import BloomBlock as BloomBlock +import deepspeed +import gc +import glob +import io +import json +import math +import os +import sys +import time +import torch +import torch.distributed as dist + +t_start = time.time() + +num_tokens = 100 + +parser = ArgumentParser() + +parser.add_argument("--name", required=True, type=str, help="model_name") +parser.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") +parser.add_argument("--batch_size", default=1, type=int, help="batch size") +parser.add_argument("--benchmark", action="store_true", help="additionally run benchmark") +parser.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload") +args = parser.parse_args() + +local_rank = int(os.getenv('LOCAL_RANK', '0')) +world_size = int(os.getenv('WORLD_SIZE', '1')) + + +### Model loading and instantiating on GPU (via ZeRO) + +model_name = args.name + +if local_rank == 0: + print(f"*** Loading the model {model_name}") + +tokenizer = AutoTokenizer.from_pretrained(model_name) +config = AutoConfig.from_pretrained(model_name) + +# XXX: can't automatically derive dtype via config's `from_pretrained` +dtype = torch.bfloat16 if model_name in ["bigscience/bloom", "bigscience/bigscience-small-testing"] else torch.float16 + +model_hidden_size = config.hidden_size +train_batch_size = 1 * world_size + +ds_config = { + "fp16": { + "enabled": dtype == torch.float16, + }, + "bf16": { + "enabled": dtype == torch.bfloat16, + }, + "zero_optimization": { + "stage": 3, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 0 + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} + +if args.cpu_offload: + ds_config["zero_optimization"]["offload_param"] = dict(device="cpu", pin_memory=True) + +dschf = HfDeepSpeedConfig(ds_config) # this tells from_pretrained to instantiate directly on gpus + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('pre-from-pretrained', force=True) + +model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) + +if args.benchmark: + deepspeed.runtime.utils.see_memory_usage('post-from-pretrained', force=True) + +model = model.eval() + +rank = dist.get_rank() + +if rank == 0: + print(ds_config) + +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() +model = ds_engine.module + +if args.benchmark: + t_ready = time.time() + + +### Generate + +if rank == 0: + print(f"*** Starting to generate {num_tokens} tokens with bs={args.batch_size}") + +input_sentences = [ + "DeepSpeed is a machine learning framework", + "He is working on", + "He has a", + "He got all", + "Everyone is happy and I can", + "The new movie that got Oscar this year", + "In the far far distance from our galaxy,", + "Peace is the only way" +] + +if args.batch_size > len(input_sentences): + # dynamically extend to support larger bs by repetition + input_sentences *= math.ceil(args.batch_size / len(input_sentences)) + +generate_kwargs = 
dict(max_new_tokens=num_tokens, do_sample=False) + +if rank == 0: + print(f"Generate args {generate_kwargs}") +inputs = input_sentences[:args.batch_size] +def generate(): + """ returns a list of zipped inputs, outputs and number of new tokens """ + + input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True) + for t in input_tokens: + if torch.is_tensor(input_tokens[t]): + input_tokens[t] = input_tokens[t].to(torch.cuda.current_device()) + + outputs = model.generate(**input_tokens, **generate_kwargs) + + input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids] + output_tokens_lengths = [x.shape[0] for x in outputs] + + total_new_tokens = [o-i for i,o in zip(input_tokens_lengths, output_tokens_lengths)] + outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + return zip(inputs, outputs, total_new_tokens) + +# XXX: this is currently doing world_size streams on world_size gpus, so we can feed it different inputs on each! and hence the time can be divided by world_size + +# warmup is a must if measuring speed as it's when all the optimizations are performed +# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs +_ = generate() + +t_generate_start = time.time() +pairs = generate() +t_generate_span = time.time() - t_generate_start +if rank == 0: + for i,o,_ in pairs: + print(f"{'-'*60}\nin={i}\nout={o}\n") + + +if args.benchmark: + torch.cuda.empty_cache() + gc.collect() + deepspeed.runtime.utils.see_memory_usage('end-of-run', force=True) + +### Benchmark + +if args.benchmark: + if rank == 0: + print(f"*** Running benchmark") + + # warm up + for i in range(1): + _ = generate() + torch.cuda.synchronize() + + # benchmark + t0 = time.time() + cycles = 5 + total_new_tokens_generated = 0 + for i in range(cycles): + generated = generate() + total_new_tokens_generated += sum(new_tokens for _,_,new_tokens in generated) + + torch.cuda.synchronize() + if rank == 0: + # note that we actually generate world_size unique streams (though the benchmark feeds the same inputs) + total_new_tokens_generated *= world_size + througput = (time.time() - t0)/(total_new_tokens_generated) + print(f""" +*** Performance stats: +Throughput per token including tokenize: {througput*1000:.2f} msecs +Start to ready to generate: {t_ready - t_start:.3f} secs +Tokenize and generate {total_new_tokens_generated} (bs={args.batch_size}) tokens: {t_generate_span:.3f} secs +Start to finish: {t_ready - t_start + t_generate_span:.3f} secs +""") + From aa8c08c107e8a1ed57457787c8df2cd25d902503 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 20 Aug 2022 03:20:26 +0530 Subject: [PATCH 20/32] move scripts --- {inference => scripts/bloom-inference-scripts}/README.md | 0 .../bloom-accelerate-inference.py | 0 .../{inference => bloom-inference-scripts}/bloom-ds-inference.py | 0 .../bloom-ds-zero-inference.py | 0 scripts/{inference => bloom-inference-server}/README.md | 0 {inference => scripts/bloom-inference-server}/benchmark.py | 0 .../bloom-inference-server}/cache_ds_checkpoints.py | 0 {inference => scripts/bloom-inference-server}/cli.py | 0 {inference => scripts/bloom-inference-server}/constants.py | 0 .../bloom-inference-server}/ds_inference/__init__.py | 0 .../bloom-inference-server}/ds_inference/cache.py | 0 .../bloom-inference-server}/ds_inference/grpc_server.py | 0 .../bloom-inference-server}/ds_inference/model.py | 0 {inference => scripts/bloom-inference-server}/ds_zero/__init__.py | 0 {inference => 
scripts/bloom-inference-server}/ds_zero/model.py | 0 .../bloom-inference-server}/hf_accelerate/__init__.py | 0 .../bloom-inference-server}/hf_accelerate/model.py | 0 {inference => scripts/bloom-inference-server}/server.py | 0 {inference => scripts/bloom-inference-server}/utils/__init__.py | 0 {inference => scripts/bloom-inference-server}/utils/model.py | 0 {inference => scripts/bloom-inference-server}/utils/requests.py | 0 {inference => scripts/bloom-inference-server}/utils/utils.py | 0 22 files changed, 0 insertions(+), 0 deletions(-) rename {inference => scripts/bloom-inference-scripts}/README.md (100%) rename scripts/{inference => bloom-inference-scripts}/bloom-accelerate-inference.py (100%) rename scripts/{inference => bloom-inference-scripts}/bloom-ds-inference.py (100%) rename scripts/{inference => bloom-inference-scripts}/bloom-ds-zero-inference.py (100%) rename scripts/{inference => bloom-inference-server}/README.md (100%) rename {inference => scripts/bloom-inference-server}/benchmark.py (100%) rename {inference => scripts/bloom-inference-server}/cache_ds_checkpoints.py (100%) rename {inference => scripts/bloom-inference-server}/cli.py (100%) rename {inference => scripts/bloom-inference-server}/constants.py (100%) rename {inference => scripts/bloom-inference-server}/ds_inference/__init__.py (100%) rename {inference => scripts/bloom-inference-server}/ds_inference/cache.py (100%) rename {inference => scripts/bloom-inference-server}/ds_inference/grpc_server.py (100%) rename {inference => scripts/bloom-inference-server}/ds_inference/model.py (100%) rename {inference => scripts/bloom-inference-server}/ds_zero/__init__.py (100%) rename {inference => scripts/bloom-inference-server}/ds_zero/model.py (100%) rename {inference => scripts/bloom-inference-server}/hf_accelerate/__init__.py (100%) rename {inference => scripts/bloom-inference-server}/hf_accelerate/model.py (100%) rename {inference => scripts/bloom-inference-server}/server.py (100%) rename {inference => scripts/bloom-inference-server}/utils/__init__.py (100%) rename {inference => scripts/bloom-inference-server}/utils/model.py (100%) rename {inference => scripts/bloom-inference-server}/utils/requests.py (100%) rename {inference => scripts/bloom-inference-server}/utils/utils.py (100%) diff --git a/inference/README.md b/scripts/bloom-inference-scripts/README.md similarity index 100% rename from inference/README.md rename to scripts/bloom-inference-scripts/README.md diff --git a/scripts/inference/bloom-accelerate-inference.py b/scripts/bloom-inference-scripts/bloom-accelerate-inference.py similarity index 100% rename from scripts/inference/bloom-accelerate-inference.py rename to scripts/bloom-inference-scripts/bloom-accelerate-inference.py diff --git a/scripts/inference/bloom-ds-inference.py b/scripts/bloom-inference-scripts/bloom-ds-inference.py similarity index 100% rename from scripts/inference/bloom-ds-inference.py rename to scripts/bloom-inference-scripts/bloom-ds-inference.py diff --git a/scripts/inference/bloom-ds-zero-inference.py b/scripts/bloom-inference-scripts/bloom-ds-zero-inference.py similarity index 100% rename from scripts/inference/bloom-ds-zero-inference.py rename to scripts/bloom-inference-scripts/bloom-ds-zero-inference.py diff --git a/scripts/inference/README.md b/scripts/bloom-inference-server/README.md similarity index 100% rename from scripts/inference/README.md rename to scripts/bloom-inference-server/README.md diff --git a/inference/benchmark.py b/scripts/bloom-inference-server/benchmark.py similarity index 
100% rename from inference/benchmark.py rename to scripts/bloom-inference-server/benchmark.py diff --git a/inference/cache_ds_checkpoints.py b/scripts/bloom-inference-server/cache_ds_checkpoints.py similarity index 100% rename from inference/cache_ds_checkpoints.py rename to scripts/bloom-inference-server/cache_ds_checkpoints.py diff --git a/inference/cli.py b/scripts/bloom-inference-server/cli.py similarity index 100% rename from inference/cli.py rename to scripts/bloom-inference-server/cli.py diff --git a/inference/constants.py b/scripts/bloom-inference-server/constants.py similarity index 100% rename from inference/constants.py rename to scripts/bloom-inference-server/constants.py diff --git a/inference/ds_inference/__init__.py b/scripts/bloom-inference-server/ds_inference/__init__.py similarity index 100% rename from inference/ds_inference/__init__.py rename to scripts/bloom-inference-server/ds_inference/__init__.py diff --git a/inference/ds_inference/cache.py b/scripts/bloom-inference-server/ds_inference/cache.py similarity index 100% rename from inference/ds_inference/cache.py rename to scripts/bloom-inference-server/ds_inference/cache.py diff --git a/inference/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py similarity index 100% rename from inference/ds_inference/grpc_server.py rename to scripts/bloom-inference-server/ds_inference/grpc_server.py diff --git a/inference/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py similarity index 100% rename from inference/ds_inference/model.py rename to scripts/bloom-inference-server/ds_inference/model.py diff --git a/inference/ds_zero/__init__.py b/scripts/bloom-inference-server/ds_zero/__init__.py similarity index 100% rename from inference/ds_zero/__init__.py rename to scripts/bloom-inference-server/ds_zero/__init__.py diff --git a/inference/ds_zero/model.py b/scripts/bloom-inference-server/ds_zero/model.py similarity index 100% rename from inference/ds_zero/model.py rename to scripts/bloom-inference-server/ds_zero/model.py diff --git a/inference/hf_accelerate/__init__.py b/scripts/bloom-inference-server/hf_accelerate/__init__.py similarity index 100% rename from inference/hf_accelerate/__init__.py rename to scripts/bloom-inference-server/hf_accelerate/__init__.py diff --git a/inference/hf_accelerate/model.py b/scripts/bloom-inference-server/hf_accelerate/model.py similarity index 100% rename from inference/hf_accelerate/model.py rename to scripts/bloom-inference-server/hf_accelerate/model.py diff --git a/inference/server.py b/scripts/bloom-inference-server/server.py similarity index 100% rename from inference/server.py rename to scripts/bloom-inference-server/server.py diff --git a/inference/utils/__init__.py b/scripts/bloom-inference-server/utils/__init__.py similarity index 100% rename from inference/utils/__init__.py rename to scripts/bloom-inference-server/utils/__init__.py diff --git a/inference/utils/model.py b/scripts/bloom-inference-server/utils/model.py similarity index 100% rename from inference/utils/model.py rename to scripts/bloom-inference-server/utils/model.py diff --git a/inference/utils/requests.py b/scripts/bloom-inference-server/utils/requests.py similarity index 100% rename from inference/utils/requests.py rename to scripts/bloom-inference-server/utils/requests.py diff --git a/inference/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py similarity index 100% rename from inference/utils/utils.py rename to 
scripts/bloom-inference-server/utils/utils.py From 25b5d851df6fd25be061f4bd11185062e3753485 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 20 Aug 2022 05:43:20 +0530 Subject: [PATCH 21/32] drop data --- scripts/bloom-inference-server/server.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index b35e6274c..4197d74a2 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -28,7 +28,6 @@ def get_args() -> argparse.Namespace: ) group.add_argument("--save_mp_checkpoint_path", required=False, type=str, help="MP checkpoints path for DS inference") - group.add_argument("--log_file", type=str, help="log data") group.add_argument("--host", type=str, required=True, help="host address") group.add_argument("--port", type=int, required=True, help="port number") group.add_argument("--workers", type=int, default=1, @@ -40,6 +39,9 @@ def get_args() -> argparse.Namespace: args = utils.get_args(parser) + if (args.save_mp_checkpoint_path): + assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" + return args @@ -47,13 +49,6 @@ def get_args() -> argparse.Namespace: args = get_args() app = FastAPI() -# Setup logging -logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d : %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.INFO, - filename=args.log_file -) logger = logging.getLogger(__name__) if (args.deployment_framework == constants.HF_ACCELERATE): From 92017708089172a8f452b1e10132f164efc148c2 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 20 Aug 2022 16:44:20 +0530 Subject: [PATCH 22/32] minor changes + add README --- scripts/bloom-inference-server/README.md | 201 ++++-------------- scripts/bloom-inference-server/cli.py | 11 +- .../ds_inference/cache.py | 3 +- .../examples/server_request.py | 37 ++++ scripts/bloom-inference-server/server.py | 15 +- .../bloom-inference-server/utils/__init__.py | 1 + scripts/bloom-inference-server/utils/utils.py | 15 +- 7 files changed, 109 insertions(+), 174 deletions(-) create mode 100644 scripts/bloom-inference-server/examples/server_request.py diff --git a/scripts/bloom-inference-server/README.md b/scripts/bloom-inference-server/README.md index 44e98f9fb..01c246830 100644 --- a/scripts/bloom-inference-server/README.md +++ b/scripts/bloom-inference-server/README.md @@ -1,195 +1,84 @@ -# Inference scripts for BLOOM +## Inference solutions for BLOOM 176B +We support HuggingFace accelerate and DeepSpeed Inference for generation. -## BLOOM Inference solutions +Required packages: +1. [DeepSpeed](https://github.com/microsoft/DeepSpeed) +1. [DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII) +1. [HuggingFace accelerate](https://github.com/huggingface/accelerate) -Here are some stats on JeanZay's 8x80GB A100 node w/ 512GB of CPU memory: +All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B. These scripts might not work for other models or a different number of GPUs. +DS inference only supports fp16 for cli and server application. However, for benchmarking, it supports both fp16 and bf16. bf16 support will be added once DeepSpeed adds suitable CUDA kernels for these. -All benchmarks are doing greedy generation of 100 token outputs: +DS inference is deployed using the DeepSpeed MII library which requires the resharded checkpoints for 8 x Tensor Parallel. 
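For reference, once such an MII deployment is running it can also be queried directly from Python. This is an illustrative sketch only: it assumes MII's standard text-generation query interface and the `ds_inference_grpc_server` deployment name that these scripts use.

```python
# Query (and later tear down) an already-running DS-inference MII deployment.
import mii

generator = mii.mii_query_handle("ds_inference_grpc_server")
result = generator.query(
    {"query": ["DeepSpeed is a machine learning framework"]},  # one prompt per batch entry
    max_new_tokens=40,
    do_sample=False,
)
print(result)

# GPU memory is sometimes not released when the deployment is shut down;
# terminating it explicitly frees the GPUs.
mii.terminate("ds_inference_grpc_server")
```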
The HuggingFace checkpoints can be resharded and cached using the following command: ``` -Generate args {'min_length': 100, 'max_length': 100, 'do_sample': False} +deepspeed --num_gpus 8 scripts/bloom-inference-server/cache_ds_checkpoints.py --model_name bigscience/bloom --dtype fp16 --save_mp_checkpoint_path ``` -The inputs are just a few tokens. - -Throughput in msecs: - -| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | -| :----------- | :---- | :---- | :---- | :---- | :---- | :--- | -| accelerate | 230.38 | 31.78 | 17.84 | 10.89 | oom | omm | -| ds-inference | 40.57 | 5.23 | | | 2.77 | 0.66 | -| ds-zero | 283 | 34.88 | oom | oom | oom | oom | - - -Start to ready to generate in secs: - -| project \ bs | 1 | 8 | 16 | 32 | 64 | 128 | -| :----------- | :--- | :--- | :--- | :--- | :--- | :--- | -| accelerate | 121 | 120 | 113 | 118 | | | -| ds-inference | 662 | 673 | | | 685 | 654 | -| ds-zero | 462 | 463 | | | | | -| | | | | | | | - - -DS-Inference load time (start to ready to generate) will become much faster soon. Once we stop relying on ds-zero to instantiate the model on gpu. The plan is to pre-shard the weights TP-wise for 8x and 16x gpus and load them directly on each gpu. Will probably be under 1min. - - -## Deepspeed-Inference - -Tensor-Parallelism and efficient fused CUDA kernels: -https://www.deepspeed.ai/tutorials/inference-tutorial/ - -### Setup +Note: Running the above script will consume ~350 GB of disk space and will take some time (~30 minutes), depending on both the speed of your GPUs and storage. +#### BLOOM inference via command-line +This asks for generate_kwargs everytime. +Example: generate_kwargs = ``` -git clone https://github.com/microsoft/DeepSpeed -cd DeepSpeed -pip install . +{"min_length": 100, "max_new_tokens": 100, "do_sample": false} ``` -### Run - +1. using HF accelerate ``` -deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom +python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` -Performance on a single node of 8x80GB A100 w/ 512GB CPU RAM (JeanZay) - just a batch of 1 (would be more efficient to run a larger batch) - -Adding `--benchmark` to activate the benchmarks - - -BS=1 +2. using DS inference ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-inference_bs=1.txt -[...] - +python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` -While processing memory per process: - -- GPU: ~50GB -- CPU: ~10GB - - -BS=8 +#### BLOOM server deployment +1. using HF accelerate ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-inference_bs=8.txt -[...] -*** Performance stats: -Throughput per token including tokenize: 5.23 msecs -Start to ready to generate: 683.397 secs -Tokenize and generate 800 (bs=8) tokens: 4.241 secs -Start to finish: 687.638 secs +python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --host --port --allowed_max_new_tokens 100 ``` -BS=64 - +2. 
using DS inference ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 64 --benchmark 2>&1 | tee bloom-ds-inference_bs=64.txt - - - - +python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --host --port --allowed_max_new_tokens 100 ``` -BS=128 +We provide an example [script](examples/server_request.py) to query the BLOOM server is provided. +#### Benchmark system for BLOOM inference +1. using HF accelerate ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-inference.py --name bigscience/bloom --batch_size 128 --benchmark 2>&1 | tee bloom-ds-inference_bs=128.txt - - - - +python scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 ``` -## Deepspeed ZeRO-Inference - -https://www.deepspeed.ai/tutorials/zero/ - -### Setup - +2. using DS inference ``` -pip install deepspeed +deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --benchmark_cycles 5 ``` - -### Run - -Note that the script currently runs the same inputs on all GPUs, but you can run a different stream on each GPU, and get `n_gpu` times faster throughput. You can't do that with Deepspeed-Inference. - - -BS=1 - +3. using DS ZeRO ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt -[...] -*** Performance stats: -Throughput per token including tokenize: 282.93 msecs -Start to ready to generate: 501.871 secs -Tokenize and generate 800 (bs=1) tokens: 226.188 secs -Start to finish: 728.060 secs +deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 ``` - -BS=8 - +Alternatively, the following shell script will benchmark different batch sizes for the model. ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt -[...] +mkdir -p logs -*** Performance stats: -Throughput per token including tokenize: 34.57 msecs -Start to ready to generate: 482.132 secs -Tokenize and generate 6400 (bs=8) tokens: 221.236 secs -Start to finish: 703.368 secs -``` +for bs in {1,2,4,8,16,32,64,128} +do + python scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/hf-$bs.log -BS=16 and higher OOMs + deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/ds-$bs.log + deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 --batch_size $bs 2>&1 | tee logs/ds-zero-$bs.log +done ``` -$ deepspeed --num_gpus 8 scripts/inference/bloom-ds-zero-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt -[...] -OOM +The following will benchmark sequence length for batch size = 1 on DS inference. 
``` - - - -## HF Accelerate - -https://github.com/huggingface/accelerate - -### Setup - -``` -pip install transformers -``` - - - -### Run - - - - -BS=1 -``` -$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 1 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=1.txt -[...] - - -``` - -BS=8 -``` -$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 8 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=8.txt -[...] - - -``` - -BS=16 -``` -$ python scripts/inference/bloom-accelerate-inference.py --name bigscience/bloom --batch_size 16 --benchmark 2>&1 | tee bloom-ds-zero-inference_bs=16.txt -[...] - - +for sq in {1,10,50,100,200,300,400,500,600,700,800,900,1000,1500,2000,2500,3000,3500,4000,4500,5000} +do + deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --batch_size 1 --benchmark_cycles 5 --deployment_framework ds_inference --generate_kwargs '{"do_sample": false, "min_length": '$sq', "max_new_tokens": '$sq'}' 2>&1 | tee logs/ds_$sq.log +done ``` diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py index 3ab5e356e..ea170e25d 100644 --- a/scripts/bloom-inference-server/cli.py +++ b/scripts/bloom-inference-server/cli.py @@ -1,5 +1,6 @@ import argparse import json +import sys import constants import utils @@ -58,7 +59,15 @@ def main() -> None: model.shutdown() if (input("change generate_kwargs? [y/n] ") == "y"): - generate_kwargs = json.loads(input("Generate kwargs: ")) + while (True): + try: + generate_kwargs = json.loads(input("Generate kwargs: ")) + break + except Exception as e: + e_type, e_message, _ = sys.exc_info() + print("error =", e_type.__name__) + print("message =", e_message) + continue request = parse_generate_kwargs(input_text, generate_kwargs) response = model.generate(request) diff --git a/scripts/bloom-inference-server/ds_inference/cache.py b/scripts/bloom-inference-server/ds_inference/cache.py index 00dffab89..cbbf1b6f6 100644 --- a/scripts/bloom-inference-server/ds_inference/cache.py +++ b/scripts/bloom-inference-server/ds_inference/cache.py @@ -12,7 +12,8 @@ def cache_ds_checkpoints(args: argparse.Namespace) -> None: - print_rank_n("Loading model...") + if (args.local_rank == 0): + print_rank_n("Loading model...") world_size = int(os.getenv("WORLD_SIZE", "1")) # Load model diff --git a/scripts/bloom-inference-server/examples/server_request.py b/scripts/bloom-inference-server/examples/server_request.py new file mode 100644 index 000000000..675253685 --- /dev/null +++ b/scripts/bloom-inference-server/examples/server_request.py @@ -0,0 +1,37 @@ +import requests + + +def main(): + url = "http://127.0.0.1:5000/generate/" + + request_body = { + "text": "DeepSpeed is a machine learning framework", + "max_new_tokens": 40 + } + response = requests.post( + url=url, + json=request_body, + verify=False + ) + print(response.json()) + # ------------------------------------------------------------------------- + # higher batch size + request_body = { + "text": [ + "DeepSpeed", + "DeepSpeed is a", + "DeepSpeed is a machine", + "DeepSpeed is a machine learning framework", + ], + "max_new_tokens": 40 + } + response = requests.post( + url=url, + json=request_body, + verify=False + ) + print(response.json()) + + +if (__name__ == "__main__"): + main() diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 4197d74a2..291dcf004 100644 --- 
a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -2,14 +2,13 @@ import logging import sys import time -import traceback import constants import utils from ds_inference import DSInferenceGRPCServer from fastapi import FastAPI, HTTPException from hf_accelerate import HFAccelerateModel -from utils import GenerateRequest, get_argument_parser, get_num_tokens_to_generate +from utils import GenerateRequest, get_argument_parser, get_num_tokens_to_generate, get_stack_trace from uvicorn import run @@ -60,18 +59,6 @@ def get_args() -> argparse.Namespace: #################################################################################### -def get_stack_trace(e_stack_trace): - trace_back = traceback.extract_tb(e_stack_trace) - - # Format stacktrace - stack_trace = [] - for trace in trace_back: - stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( - trace[0], trace[1], trace[2], trace[3])) - - return stack_trace - - @app.post("/generate/") def generate(request: GenerateRequest) -> dict: # needs to be global since it is updated diff --git a/scripts/bloom-inference-server/utils/__init__.py b/scripts/bloom-inference-server/utils/__init__.py index 77bb4ceaa..bece92d2e 100644 --- a/scripts/bloom-inference-server/utils/__init__.py +++ b/scripts/bloom-inference-server/utils/__init__.py @@ -6,6 +6,7 @@ get_argument_parser, get_dummy_batch, get_num_tokens_to_generate, + get_stack_trace, get_str_dtype, print_rank_n, run_rank_n diff --git a/scripts/bloom-inference-server/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py index 20eb393ac..110e4b199 100644 --- a/scripts/bloom-inference-server/utils/utils.py +++ b/scripts/bloom-inference-server/utils/utils.py @@ -2,13 +2,12 @@ import copy import json import math +import traceback from typing import Any, List import torch import torch.distributed as dist -from .requests import GenerateRequest - dummy_input_sentences = [ "DeepSpeed is a machine learning framework", @@ -114,3 +113,15 @@ def get_num_tokens_to_generate(max_new_tokens: int, return allowed_max_new_tokens else: return min(max_new_tokens, allowed_max_new_tokens) + + +def get_stack_trace(e_stack_trace): + trace_back = traceback.extract_tb(e_stack_trace) + + # Format stacktrace + stack_trace = [] + for trace in trace_back: + stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( + trace[0], trace[1], trace[2], trace[3])) + + return stack_trace From 649d7f8336ab6d948fa9aaceccd320c13a9b6b41 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 20 Aug 2022 16:51:21 +0530 Subject: [PATCH 23/32] update README --- scripts/bloom-inference-server/README.md | 29 +++++++++++++++--------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/scripts/bloom-inference-server/README.md b/scripts/bloom-inference-server/README.md index 01c246830..7a0ef3fc3 100644 --- a/scripts/bloom-inference-server/README.md +++ b/scripts/bloom-inference-server/README.md @@ -10,36 +10,43 @@ All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B. These sc DS inference only supports fp16 for cli and server application. However, for benchmarking, it supports both fp16 and bf16. bf16 support will be added once DeepSpeed adds suitable CUDA kernels for these. DS inference is deployed using the DeepSpeed MII library which requires the resharded checkpoints for 8 x Tensor Parallel. 
The HuggingFace checkpoints can be resharded and cached using the following command: -``` +```shell deepspeed --num_gpus 8 scripts/bloom-inference-server/cache_ds_checkpoints.py --model_name bigscience/bloom --dtype fp16 --save_mp_checkpoint_path ``` Note: Running the above script will consume ~350 GB of disk space and will take some time (~30 minutes), depending on both the speed of your GPUs and storage. +Note: sometimes GPU memory is not freed when DS inference deployment is shutdown. You can free this memory by running: +```python +import mii +mii.terminate("ds_inference_grpc_server") +``` +or alternatively, just doing a `killall python` in terminal. + #### BLOOM inference via command-line This asks for generate_kwargs everytime. Example: generate_kwargs = -``` +```json {"min_length": 100, "max_new_tokens": 100, "do_sample": false} ``` 1. using HF accelerate -``` +```shell python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` 2. using DS inference -``` +```shell python scripts/bloom-inference-server/cli.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --generate_kwargs '{"min_length": 100, "max_new_tokens": 100, "do_sample": false}' ``` #### BLOOM server deployment 1. using HF accelerate -``` +```shell python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --host --port --allowed_max_new_tokens 100 ``` 2. using DS inference -``` +```shell python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --host --port --allowed_max_new_tokens 100 ``` @@ -47,22 +54,22 @@ We provide an example [script](examples/server_request.py) to query the BLOOM se #### Benchmark system for BLOOM inference 1. using HF accelerate -``` +```shell python scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework hf_accelerate --benchmark_cycles 5 ``` 2. using DS inference -``` +```shell deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --benchmark_cycles 5 ``` 3. using DS ZeRO -``` +```shell deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype bf16 --deployment_framework ds_zero --benchmark_cycles 5 ``` Alternatively, the following shell script will benchmark different batch sizes for the model. -``` +```shell mkdir -p logs for bs in {1,2,4,8,16,32,64,128} @@ -76,7 +83,7 @@ done ``` The following will benchmark sequence length for batch size = 1 on DS inference. 
-``` +```shell for sq in {1,10,50,100,200,300,400,500,600,700,800,900,1000,1500,2000,2500,3000,3500,4000,4500,5000} do deepspeed --num_gpus 8 scripts/bloom-inference-server/benchmark.py --model_name bigscience/bloom --dtype fp16 --batch_size 1 --benchmark_cycles 5 --deployment_framework ds_inference --generate_kwargs '{"do_sample": false, "min_length": '$sq', "max_new_tokens": '$sq'}' 2>&1 | tee logs/ds_$sq.log From 997a5fa2571cd86fbcf3c9de16fb4f611bac1130 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sun, 21 Aug 2022 22:12:22 +0530 Subject: [PATCH 24/32] drop nccl --- scripts/bloom-inference-server/benchmark.py | 1 - scripts/bloom-inference-server/server.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py index f2c60258d..b4266a007 100644 --- a/scripts/bloom-inference-server/benchmark.py +++ b/scripts/bloom-inference-server/benchmark.py @@ -176,7 +176,6 @@ def main() -> None: if (args.deployment_framework == constants.HF_ACCELERATE): benchmark_end_to_end(args, HFAccelerateModel) elif (args.deployment_framework == constants.DS_INFERENCE): - deepspeed.init_distributed("nccl") benchmark_end_to_end(args, DSInferenceModel) elif (args.deployment_framework == constants.DS_ZERO): benchmark_end_to_end(args, DSZeROModel, zero_activated=True) diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 291dcf004..55d36cd63 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -54,6 +54,9 @@ def get_args() -> argparse.Namespace: model = HFAccelerateModel(args) elif (args.deployment_framework == constants.DS_INFERENCE): model = DSInferenceGRPCServer(args) +else: + raise ValueError( + f"Unknown deployment framework {args.deployment_framework}") query_id = 0 #################################################################################### From 85d9fcb8d093152647838ee31bc9c803f129a6c5 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 24 Aug 2022 10:09:34 +0530 Subject: [PATCH 25/32] fix --- scripts/bloom-inference-server/README.md | 26 +++++- scripts/bloom-inference-server/benchmark.py | 28 +----- .../ds_inference/grpc_server.py | 9 +- .../ds_inference/model.py | 5 +- .../bloom-inference-server/ds_zero/model.py | 1 + .../examples/server_request.py | 60 +++++++++--- .../hf_accelerate/model.py | 1 + scripts/bloom-inference-server/server.py | 92 ++++++++++++++----- .../bloom-inference-server/utils/__init__.py | 13 ++- scripts/bloom-inference-server/utils/model.py | 33 ++++--- .../bloom-inference-server/utils/requests.py | 24 +++-- scripts/bloom-inference-server/utils/utils.py | 44 +++++---- 12 files changed, 227 insertions(+), 109 deletions(-) diff --git a/scripts/bloom-inference-server/README.md b/scripts/bloom-inference-server/README.md index 7a0ef3fc3..6d140b36f 100644 --- a/scripts/bloom-inference-server/README.md +++ b/scripts/bloom-inference-server/README.md @@ -1,10 +1,23 @@ ## Inference solutions for BLOOM 176B We support HuggingFace accelerate and DeepSpeed Inference for generation. -Required packages: -1. [DeepSpeed](https://github.com/microsoft/DeepSpeed) -1. [DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII) -1. 
[HuggingFace accelerate](https://github.com/huggingface/accelerate) +Install required packages: + +```shell +pip install fastapi uvicorn accelerate huggingface_hub>=0.9.0 +``` +To install [DeepSpeed](https://github.com/microsoft/DeepSpeed): +```shell +git clone https://github.com/microsoft/DeepSpeed +cd DeepSpeed +CFLAGS="-I$CONDA_PREFIX/include/" LDFLAGS="-L$CONDA_PREFIX/lib/" TORCH_CUDA_ARCH_LIST="7.0" DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check +``` +To install [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII): +```shell +git clone https://github.com/microsoft/DeepSpeed-MII +cd DeepSpeed-MII +pip install . +``` All the provided scripts are tested on 8 A100 80GB GPUs for BLOOM 176B. These scripts might not work for other models or a different number of GPUs. DS inference only supports fp16 for cli and server application. However, for benchmarking, it supports both fp16 and bf16. bf16 support will be added once DeepSpeed adds suitable CUDA kernels for these. @@ -50,7 +63,10 @@ python scripts/bloom-inference-server/server.py --model_name bigscience/bloom -- python scripts/bloom-inference-server/server.py --model_name bigscience/bloom --dtype fp16 --deployment_framework ds_inference --save_mp_checkpoint_path --host --port --allowed_max_new_tokens 100 ``` -We provide an example [script](examples/server_request.py) to query the BLOOM server is provided. +We provide an example [script](examples/server_request.py) to query the BLOOM server is provided. To run this script: +```shell +python scripts/bloom-inference-server/examples/server_request.py --host --port +``` #### Benchmark system for BLOOM inference 1. using HF accelerate diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py index b4266a007..af260c99c 100644 --- a/scripts/bloom-inference-server/benchmark.py +++ b/scripts/bloom-inference-server/benchmark.py @@ -1,10 +1,7 @@ import argparse import gc import os -import time -from typing import Any, List, Tuple, Union -import deepspeed import torch import constants @@ -13,33 +10,16 @@ from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel from utils import ( - Execute, GenerateRequest, Model, get_argument_parser, get_dummy_batch, parse_generate_kwargs, - print_rank_n + print_rank_n, + run_and_log_time ) -def run_and_log_time(execs: Union[List[Execute], Execute]) -> Tuple[Union[List[Any], Any], float]: - """ - runs a list of Execute objects and returns a list of outputs and the time taken - """ - start_time = time.time() - - if (type(execs) == list): - results = [] - for e in execs: - results.append(e()) - else: - results = execs() - - time_elapsed = time.time() - start_time - return results, time_elapsed - - def benchmark_generation(model: Model, request: GenerateRequest, cycles: int = 5): @@ -73,7 +53,7 @@ def benchmark_end_to_end(args: argparse.Namespace, model_class: Model, zero_activated: bool = False) -> None: model, initialization_time = run_and_log_time( - Execute(model_class, {"args": args}) + (model_class, {"args": args}) ) request = parse_generate_kwargs( @@ -103,7 +83,7 @@ def benchmark_end_to_end(args: argparse.Namespace, # benchmark total_new_tokens_generated, benchmark_time = run_and_log_time( - Execute( + ( benchmark_generation, { "model": model, diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py index 
30cace136..bee503ef4 100644 --- a/scripts/bloom-inference-server/ds_inference/grpc_server.py +++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py @@ -6,7 +6,13 @@ from transformers import AutoTokenizer import mii -from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype +from utils import ( + GenerateRequest, + GenerateResponse, + Model, + get_filter_dict, + get_str_dtype +) class DSInferenceGRPCServer(Model): @@ -40,6 +46,7 @@ def __init__(self, args: argparse.Namespace) -> None: raise NotImplementedError("This is not yet supported") self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.pad = self.tokenizer.pad_token_id self.model = mii.mii_query_handle(self.deployment_name) def generate(self, request: GenerateRequest) -> GenerateResponse: diff --git a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py index 9ee6e79c2..d31b39c6a 100644 --- a/scripts/bloom-inference-server/ds_inference/model.py +++ b/scripts/bloom-inference-server/ds_inference/model.py @@ -16,10 +16,13 @@ class DSInferenceModel(Model): def __init__(self, args: Namespace) -> None: - print_rank_n("Loading model...") + if (args.local_rank == 0): + # print_rank_n won't work here since deepspeed is not initialized yet + print_rank_n("Loading model...") world_size = int(os.getenv("WORLD_SIZE", "1")) self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.pad = self.tokenizer.pad_token_id # Load model with deepspeed.OnDevice(dtype=args.dtype, device="meta"): diff --git a/scripts/bloom-inference-server/ds_zero/model.py b/scripts/bloom-inference-server/ds_zero/model.py index 0db700869..7106b2f6e 100644 --- a/scripts/bloom-inference-server/ds_zero/model.py +++ b/scripts/bloom-inference-server/ds_zero/model.py @@ -50,6 +50,7 @@ def __init__(self, args: Namespace) -> None: dschf = HfDeepSpeedConfig(ds_config) self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.pad = self.tokenizer.pad_token_id self.model = AutoModelForCausalLM.from_pretrained( args.model_name, torch_dtype=args.dtype) diff --git a/scripts/bloom-inference-server/examples/server_request.py b/scripts/bloom-inference-server/examples/server_request.py index 675253685..d843ecd65 100644 --- a/scripts/bloom-inference-server/examples/server_request.py +++ b/scripts/bloom-inference-server/examples/server_request.py @@ -1,11 +1,28 @@ +import argparse + import requests -def main(): - url = "http://127.0.0.1:5000/generate/" +def get_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + + group = parser.add_argument_group(title="launch config") + group.add_argument("--host", type=str, required=True, help="host address") + group.add_argument("--port", type=int, required=True, help="port number") + + return parser.parse_args() + + +def generate(url: str) -> None: + url = url + "/generate/" request_body = { - "text": "DeepSpeed is a machine learning framework", + "text": [ + "DeepSpeed", + "DeepSpeed is a", + "DeepSpeed is a machine", + "DeepSpeed is a machine learning framework", + ], "max_new_tokens": 40 } response = requests.post( @@ -13,24 +30,43 @@ def main(): json=request_body, verify=False ) - print(response.json()) - # ------------------------------------------------------------------------- - # higher batch size + print(response.json(), "\n") + + +def tokenize(url: str) -> None: + url = url + "/tokenize/" + request_body = { "text": [ - "DeepSpeed", "DeepSpeed is a", - "DeepSpeed is a machine", - "DeepSpeed is a 
machine learning framework", - ], - "max_new_tokens": 40 + "DeepSpeed is a machine learning framework" + ] } response = requests.post( url=url, json=request_body, verify=False ) - print(response.json()) + print(response.json(), "\n") + + +def query_id(url: str) -> None: + url = url + "/query_id/" + + response = requests.get( + url=url, + verify=False + ) + print(response.json(), "\n") + + +def main(): + args = get_args() + url = "http://{}:{}".format(args.host, args.port) + + generate(url) + tokenize(url) + query_id(url) if (__name__ == "__main__"): diff --git a/scripts/bloom-inference-server/hf_accelerate/model.py b/scripts/bloom-inference-server/hf_accelerate/model.py index 2ce8ae020..1d58fd605 100644 --- a/scripts/bloom-inference-server/hf_accelerate/model.py +++ b/scripts/bloom-inference-server/hf_accelerate/model.py @@ -11,6 +11,7 @@ def __init__(self, args: Namespace) -> None: print_rank_n("Loading model...") self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.pad = self.tokenizer.pad_token_id self.model = AutoModelForCausalLM.from_pretrained( args.model_name, diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 55d36cd63..73ae041cd 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -1,17 +1,31 @@ import argparse import logging import sys -import time +import traceback import constants import utils from ds_inference import DSInferenceGRPCServer from fastapi import FastAPI, HTTPException from hf_accelerate import HFAccelerateModel -from utils import GenerateRequest, get_argument_parser, get_num_tokens_to_generate, get_stack_trace +from pydantic import BaseModel +from utils import ( + GenerateRequest, + GenerateResponse, + TokenizeRequest, + TokenizeResponse, + get_argument_parser, + get_num_tokens_to_generate, + run_and_log_time +) from uvicorn import run +class QueryID(BaseModel): + generate_query_id: int = 0 + tokenize_query_id: int = 0 + + def get_args() -> argparse.Namespace: parser = get_argument_parser() @@ -44,6 +58,28 @@ def get_args() -> argparse.Namespace: return args +def get_exception_response(query_id: int): + e_type, e_message, e_stack_trace = sys.exc_info() + response = { + "error": str(e_type.__name__), + "message": str(e_message), + "query_id": query_id + } + + if (args.debug): + trace_back = traceback.extract_tb(e_stack_trace) + + # Format stacktrace + stack_trace = [] + for trace in trace_back: + stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( + trace[0], trace[1], trace[2], trace[3])) + + response["stack_trace"] = stack_trace + + return response + + #################################################################################### args = get_args() app = FastAPI() @@ -58,40 +94,52 @@ def get_args() -> argparse.Namespace: raise ValueError( f"Unknown deployment framework {args.deployment_framework}") -query_id = 0 +query_ids = QueryID() #################################################################################### @app.post("/generate/") -def generate(request: GenerateRequest) -> dict: - # needs to be global since it is updated - global query_id - +def generate(request: GenerateRequest) -> GenerateResponse: try: - start_time = time.time() - request.max_new_tokens = get_num_tokens_to_generate( request.max_new_tokens, args.allowed_max_new_tokens) - response = model.generate(request) - response.query_id = query_id - response.total_time_taken = time.time() - start_time + response, total_time_taken = 
run_and_log_time( + (model.generate, {"request": request}) + ) + + response.query_id = query_ids.generate_query_id + query_ids.generate_query_id += 1 + response.total_time_taken = total_time_taken - query_id += 1 return response except Exception: - e_type, e_message, e_stack_trace = sys.exc_info() - response = { - "error": str(e_type.__name__), - "message": str(e_message), - "query_id": query_id - } + response = get_exception_response(query_ids.generate_query_id) + query_ids.generate_query_id += 1 + raise HTTPException(500, response) + + +@app.post("/tokenize/") +def tokenize(request: TokenizeRequest) -> TokenizeResponse: + try: + response, total_time_taken = run_and_log_time( + (model.tokenize, {"request": request}) + ) - if (args.debug): - response["stack_trace"] = get_stack_trace(e_stack_trace) + response.query_id = query_ids.tokenize_query_id + query_ids.tokenize_query_id += 1 + response.total_time_taken = total_time_taken - query_id += 1 + return response + except Exception: + response = get_exception_response(query_ids.tokenize_query_id) + query_ids.tokenize_query_id += 1 raise HTTPException(500, response) +@app.get("/query_id/") +def query_id() -> QueryID: + return query_ids + + run(app, host=args.host, port=args.port, workers=args.workers) diff --git a/scripts/bloom-inference-server/utils/__init__.py b/scripts/bloom-inference-server/utils/__init__.py index bece92d2e..9ee67a87f 100644 --- a/scripts/bloom-inference-server/utils/__init__.py +++ b/scripts/bloom-inference-server/utils/__init__.py @@ -1,13 +1,20 @@ from .model import Model -from .requests import GenerateRequest, GenerateResponse, get_filter_dict, parse_generate_kwargs +from .requests import ( + GenerateRequest, + GenerateResponse, + TokenizeRequest, + TokenizeResponse, + get_filter_dict, + parse_generate_kwargs +) from .utils import ( - Execute, get_args, get_argument_parser, get_dummy_batch, get_num_tokens_to_generate, - get_stack_trace, get_str_dtype, + pad_ids, print_rank_n, + run_and_log_time, run_rank_n ) diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py index 33da8d114..03bec99a3 100644 --- a/scripts/bloom-inference-server/utils/model.py +++ b/scripts/bloom-inference-server/utils/model.py @@ -2,31 +2,27 @@ import torch -from .requests import GenerateRequest, GenerateResponse +from .requests import GenerateRequest, GenerateResponse, TokenizeRequest, TokenizeResponse class Model: def __init__(self, args: argparse.Namespace) -> None: self.tokenizer = None + self.pad = None self.model = None self.input_device = None raise NotImplementedError("This is a dummy class") def generate(self, request: GenerateRequest) -> GenerateResponse: - text = request.text - - return_type = type(text) - if (return_type == str): - text = [text] - - input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) + input_tokens = self.tokenizer( + request.text, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): input_tokens[t] = input_tokens[t].to(self.input_device) with torch.no_grad(): - output_tokens = self.model.generate( + output = self.model.generate( **input_tokens, min_length=request.min_length, do_sample=request.do_sample, @@ -52,10 +48,11 @@ def generate(self, request: GenerateRequest) -> GenerateResponse: forced_bos_token_id=request.forced_bos_token_id, forced_eos_token_id=request.forced_eos_token_id, exponential_decay_length_penalty=request.exponential_decay_length_penalty, - bad_words_ids=request.bad_words_ids, - 
force_words_ids=request.force_words_ids + return_dict_in_generate=True ) + output_tokens = output.sequences + input_token_lengths = [x.shape[0] for x in input_tokens.input_ids] output_token_lengths = [x.shape[0] for x in output_tokens] generated_tokens = [ @@ -68,14 +65,20 @@ def generate(self, request: GenerateRequest) -> GenerateResponse: output_text = self.tokenizer.batch_decode( output_tokens, skip_special_tokens=True) - if (return_type == str): - output_text = output_text[0] - generated_tokens = generated_tokens[0] - return GenerateResponse( text=output_text, num_generated_tokens=generated_tokens ) + def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: + output = self.tokenizer( + request.text, + padding=request.padding + ) + return TokenizeResponse( + token_ids=output.input_ids, + attention_mask=output.attention_mask + ) + def shutdown(self) -> None: exit() diff --git a/scripts/bloom-inference-server/utils/requests.py b/scripts/bloom-inference-server/utils/requests.py index b8d8965e1..8f23dce3a 100644 --- a/scripts/bloom-inference-server/utils/requests.py +++ b/scripts/bloom-inference-server/utils/requests.py @@ -1,10 +1,10 @@ -from typing import Any, List, Union +from typing import Any, List from pydantic import BaseModel class GenerateRequest(BaseModel): - text: Union[List[str], str] + text: List[str] min_length: int = None do_sample: bool = None early_stopping: bool = None @@ -29,14 +29,24 @@ class GenerateRequest(BaseModel): forced_bos_token_id: int = None forced_eos_token_id: int = None exponential_decay_length_penalty: float = None - bad_words_ids: List[int] = None - force_words_ids: Union[List[int], List[List[int]]] = None remove_input_from_output: bool = False class GenerateResponse(BaseModel): - text: Union[List[str], str] = None - num_generated_tokens: Union[List[int], int] = None + text: List[str] = None + num_generated_tokens: List[int] = None + query_id: int = None + total_time_taken: float = None + + +class TokenizeRequest(BaseModel): + text: List[str] + padding: bool = False + + +class TokenizeResponse(BaseModel): + token_ids: List[List[int]] = None + attention_mask: List[List[int]] = None query_id: int = None total_time_taken: float = None @@ -65,7 +75,7 @@ def parse_field(kwargs: dict, return default_value -def parse_generate_kwargs(text: Union[List[str], str], kwargs: dict) -> GenerateRequest: +def parse_generate_kwargs(text: List[str], kwargs: dict) -> GenerateRequest: return GenerateRequest( text=text, min_length=parse_field(kwargs, "min_length", int), diff --git a/scripts/bloom-inference-server/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py index 110e4b199..9fec36364 100644 --- a/scripts/bloom-inference-server/utils/utils.py +++ b/scripts/bloom-inference-server/utils/utils.py @@ -2,12 +2,14 @@ import copy import json import math -import traceback -from typing import Any, List +import time +from typing import Any, List, Tuple, Union import torch import torch.distributed as dist +from pydantic import BaseModel + dummy_input_sentences = [ "DeepSpeed is a machine learning framework", @@ -21,15 +23,6 @@ ] -class Execute: - def __init__(self, func: callable, kwargs: dict) -> None: - self.func = func - self.kwargs = kwargs - - def __call__(self) -> Any: - return self.func(**self.kwargs) - - def get_argument_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() @@ -115,13 +108,26 @@ def get_num_tokens_to_generate(max_new_tokens: int, return min(max_new_tokens, allowed_max_new_tokens) -def get_stack_trace(e_stack_trace): - 
trace_back = traceback.extract_tb(e_stack_trace) +def run_and_log_time(execs: Union[List[Tuple[callable, dict]], + Tuple[callable, dict]]) -> Tuple[Union[List[Any], Any], float]: + start_time = time.time() + + if (type(execs) == list): + results = [] + for f, k in execs: + results.append(f(**k)) + else: + results = execs[0](**execs[1]) + + time_elapsed = time.time() - start_time + return results, time_elapsed + + +def pad_ids(arrays, padding, max_length=-1): + if (max_length < 0): + max_length = max(list(map(len, arrays))) - # Format stacktrace - stack_trace = [] - for trace in trace_back: - stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( - trace[0], trace[1], trace[2], trace[3])) + arrays = [[padding] * (max_length - len(array)) + + array for array in arrays] - return stack_trace + return arrays From 11d50f18ac17173f5288c42eca6bf933719c6503 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Wed, 24 Aug 2022 23:17:11 +0530 Subject: [PATCH 26/32] default values --- .../bloom-inference-server/ds_inference/grpc_server.py | 8 +------- scripts/bloom-inference-server/server.py | 5 +++-- scripts/bloom-inference-server/utils/requests.py | 4 ++-- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py index bee503ef4..ec502b43c 100644 --- a/scripts/bloom-inference-server/ds_inference/grpc_server.py +++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py @@ -6,13 +6,7 @@ from transformers import AutoTokenizer import mii -from utils import ( - GenerateRequest, - GenerateResponse, - Model, - get_filter_dict, - get_str_dtype -) +from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype class DSInferenceGRPCServer(Model): diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 73ae041cd..430fa4ef1 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -110,7 +110,7 @@ def generate(request: GenerateRequest) -> GenerateResponse: response.query_id = query_ids.generate_query_id query_ids.generate_query_id += 1 - response.total_time_taken = total_time_taken + response.total_time_taken = "{:.2f} secs".format(total_time_taken) return response except Exception: @@ -128,7 +128,8 @@ def tokenize(request: TokenizeRequest) -> TokenizeResponse: response.query_id = query_ids.tokenize_query_id query_ids.tokenize_query_id += 1 - response.total_time_taken = total_time_taken + response.total_time_taken = "{:.2f} msecs".format( + total_time_taken * 1000) return response except Exception: diff --git a/scripts/bloom-inference-server/utils/requests.py b/scripts/bloom-inference-server/utils/requests.py index 8f23dce3a..c8fab3700 100644 --- a/scripts/bloom-inference-server/utils/requests.py +++ b/scripts/bloom-inference-server/utils/requests.py @@ -36,7 +36,7 @@ class GenerateResponse(BaseModel): text: List[str] = None num_generated_tokens: List[int] = None query_id: int = None - total_time_taken: float = None + total_time_taken: str = None class TokenizeRequest(BaseModel): @@ -48,7 +48,7 @@ class TokenizeResponse(BaseModel): token_ids: List[List[int]] = None attention_mask: List[List[int]] = None query_id: int = None - total_time_taken: float = None + total_time_taken: str = None def parse_bool(value: str) -> bool: From 403424bde0aa7f0dbd5d677a8d09d9813b3a9853 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 25 Aug 2022 
04:34:13 +0530 Subject: [PATCH 27/32] resolve issues --- scripts/bloom-inference-server/cli.py | 45 +++++++++------- .../ds_inference/grpc_server.py | 7 ++- scripts/bloom-inference-server/server.py | 53 +++++++++++-------- .../bloom-inference-server/utils/requests.py | 21 +++++--- 4 files changed, 73 insertions(+), 53 deletions(-) diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py index ea170e25d..ac5f6cce1 100644 --- a/scripts/bloom-inference-server/cli.py +++ b/scripts/bloom-inference-server/cli.py @@ -53,28 +53,33 @@ def main() -> None: # fine but might need to run_rank_n for this # if running a deployment_framework with # multiple processes - input_text = input("Input text: ") - - if (input_text == args.shutdown_command): + try: + input_text = input("Input text: ") + + if (input_text == args.shutdown_command): + model.shutdown() + + if (input("change generate_kwargs? [y/n] ") == "y"): + while (True): + try: + generate_kwargs = json.loads(input("Generate kwargs: ")) + break + except KeyboardInterrupt: + model.shutdown() + except Exception as e: + e_type, e_message, _ = sys.exc_info() + print("error =", e_type.__name__) + print("message =", e_message) + continue + + request = parse_generate_kwargs(input_text, generate_kwargs) + response = model.generate(request) + + print_rank_n("Output text:", response.text) + print_rank_n("Generated tokens:", response.num_generated_tokens) + except KeyboardInterrupt: model.shutdown() - if (input("change generate_kwargs? [y/n] ") == "y"): - while (True): - try: - generate_kwargs = json.loads(input("Generate kwargs: ")) - break - except Exception as e: - e_type, e_message, _ = sys.exc_info() - print("error =", e_type.__name__) - print("message =", e_message) - continue - - request = parse_generate_kwargs(input_text, generate_kwargs) - response = model.generate(request) - - print_rank_n("Output text:", response.text) - print_rank_n("Generated tokens:", response.num_generated_tokens) - if (__name__ == "__main__"): main() diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py index ec502b43c..7c2ed7019 100644 --- a/scripts/bloom-inference-server/ds_inference/grpc_server.py +++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py @@ -82,5 +82,8 @@ def generate(self, request: GenerateRequest) -> GenerateResponse: ) def shutdown(self) -> None: - mii.terminate(self.deployment_name) - exit() + # MII is buggy and sometimes spits out an error in terminate + try: + mii.terminate(self.deployment_name) + except Exception: + exit() diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 430fa4ef1..91099e528 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -1,5 +1,6 @@ import argparse import logging +import os import sys import traceback @@ -58,12 +59,31 @@ def get_args() -> argparse.Namespace: return args -def get_exception_response(query_id: int): +#################################################################################### +args = get_args() +app = FastAPI() + +logger = logging.getLogger(__name__) + +if (args.deployment_framework == constants.HF_ACCELERATE): + model = HFAccelerateModel(args) +elif (args.deployment_framework == constants.DS_INFERENCE): + model = DSInferenceGRPCServer(args) +else: + raise ValueError( + f"Unknown deployment framework {args.deployment_framework}") + +query_ids = QueryID() 
+#################################################################################### + + +def get_exception_response(query_id: int, method: str): e_type, e_message, e_stack_trace = sys.exc_info() response = { "error": str(e_type.__name__), "message": str(e_message), - "query_id": query_id + "query_id": query_id, + "method": method } if (args.debug): @@ -80,24 +100,6 @@ def get_exception_response(query_id: int): return response -#################################################################################### -args = get_args() -app = FastAPI() - -logger = logging.getLogger(__name__) - -if (args.deployment_framework == constants.HF_ACCELERATE): - model = HFAccelerateModel(args) -elif (args.deployment_framework == constants.DS_INFERENCE): - model = DSInferenceGRPCServer(args) -else: - raise ValueError( - f"Unknown deployment framework {args.deployment_framework}") - -query_ids = QueryID() -#################################################################################### - - @app.post("/generate/") def generate(request: GenerateRequest) -> GenerateResponse: try: @@ -114,7 +116,8 @@ def generate(request: GenerateRequest) -> GenerateResponse: return response except Exception: - response = get_exception_response(query_ids.generate_query_id) + response = get_exception_response( + query_ids.generate_query_id, request.method) query_ids.generate_query_id += 1 raise HTTPException(500, response) @@ -133,7 +136,8 @@ def tokenize(request: TokenizeRequest) -> TokenizeResponse: return response except Exception: - response = get_exception_response(query_ids.tokenize_query_id) + response = get_exception_response( + query_ids.tokenize_query_id, request.method) query_ids.tokenize_query_id += 1 raise HTTPException(500, response) @@ -143,4 +147,7 @@ def query_id() -> QueryID: return query_ids -run(app, host=args.host, port=args.port, workers=args.workers) +try: + run(app, host=args.host, port=args.port, workers=args.workers) +except KeyboardInterrupt: + model.shutdown() diff --git a/scripts/bloom-inference-server/utils/requests.py b/scripts/bloom-inference-server/utils/requests.py index c8fab3700..5832d6ae2 100644 --- a/scripts/bloom-inference-server/utils/requests.py +++ b/scripts/bloom-inference-server/utils/requests.py @@ -3,8 +3,13 @@ from pydantic import BaseModel +class BaseResponse(BaseModel): + query_id: int = None + total_time_taken: str = None + + class GenerateRequest(BaseModel): - text: List[str] + text: List[str] = None min_length: int = None do_sample: bool = None early_stopping: bool = None @@ -30,25 +35,25 @@ class GenerateRequest(BaseModel): forced_eos_token_id: int = None exponential_decay_length_penalty: float = None remove_input_from_output: bool = False + method: str = "generate" -class GenerateResponse(BaseModel): +class GenerateResponse(BaseResponse): text: List[str] = None num_generated_tokens: List[int] = None - query_id: int = None - total_time_taken: str = None + method: str = "generate" class TokenizeRequest(BaseModel): - text: List[str] + text: List[str] = None padding: bool = False + method: str = "tokenize" -class TokenizeResponse(BaseModel): +class TokenizeResponse(BaseResponse): token_ids: List[List[int]] = None attention_mask: List[List[int]] = None - query_id: int = None - total_time_taken: str = None + method: str = "tokenize" def parse_bool(value: str) -> bool: From 493b2ee0b63b96b60a0a7bd5ec4b90c68aaab3cd Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 25 Aug 2022 05:44:14 +0530 Subject: [PATCH 28/32] handle keyboard interrupt --- 
scripts/bloom-inference-server/cli.py | 9 +++++---- .../bloom-inference-server/ds_inference/grpc_server.py | 3 ++- scripts/bloom-inference-server/utils/model.py | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py index ac5f6cce1..ced0ef4ff 100644 --- a/scripts/bloom-inference-server/cli.py +++ b/scripts/bloom-inference-server/cli.py @@ -62,7 +62,8 @@ def main() -> None: if (input("change generate_kwargs? [y/n] ") == "y"): while (True): try: - generate_kwargs = json.loads(input("Generate kwargs: ")) + generate_kwargs = json.loads( + input("Generate kwargs: ")) break except KeyboardInterrupt: model.shutdown() @@ -72,11 +73,11 @@ def main() -> None: print("message =", e_message) continue - request = parse_generate_kwargs(input_text, generate_kwargs) + request = parse_generate_kwargs([input_text], generate_kwargs) response = model.generate(request) - print_rank_n("Output text:", response.text) - print_rank_n("Generated tokens:", response.num_generated_tokens) + print_rank_n("Output text:", response.text[0]) + print_rank_n("Generated tokens:", response.num_generated_tokens[0]) except KeyboardInterrupt: model.shutdown() diff --git a/scripts/bloom-inference-server/ds_inference/grpc_server.py b/scripts/bloom-inference-server/ds_inference/grpc_server.py index 7c2ed7019..47ab08c73 100644 --- a/scripts/bloom-inference-server/ds_inference/grpc_server.py +++ b/scripts/bloom-inference-server/ds_inference/grpc_server.py @@ -6,7 +6,7 @@ from transformers import AutoTokenizer import mii -from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype +from utils import GenerateRequest, GenerateResponse, Model, get_filter_dict, get_str_dtype, print_rank_n class DSInferenceGRPCServer(Model): @@ -82,6 +82,7 @@ def generate(self, request: GenerateRequest) -> GenerateResponse: ) def shutdown(self) -> None: + print_rank_n("shutting down") # MII is buggy and sometimes spits out an error in terminate try: mii.terminate(self.deployment_name) diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py index 03bec99a3..e437d32eb 100644 --- a/scripts/bloom-inference-server/utils/model.py +++ b/scripts/bloom-inference-server/utils/model.py @@ -3,6 +3,7 @@ import torch from .requests import GenerateRequest, GenerateResponse, TokenizeRequest, TokenizeResponse +from .utils import print_rank_n class Model: @@ -81,4 +82,5 @@ def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: ) def shutdown(self) -> None: + print_rank_n("shutting down") exit() From c84d9b77cfbfaab6ec005c2760632cb0d87fb51f Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Sat, 27 Aug 2022 15:26:04 +0530 Subject: [PATCH 29/32] remove caching --- scripts/bloom-inference-server/benchmark.py | 29 ++-- .../cache_ds_checkpoints.py | 27 --- scripts/bloom-inference-server/cli.py | 18 +- scripts/bloom-inference-server/constants.py | 3 - .../ds_inference/__init__.py | 1 - .../ds_inference/cache.py | 68 -------- .../ds_inference/model.py | 158 ++++++++---------- scripts/bloom-inference-server/server.py | 24 +-- .../bloom-inference-server/utils/__init__.py | 1 + .../bloom-inference-server/utils/constants.py | 40 +++++ scripts/bloom-inference-server/utils/utils.py | 52 +++++- 11 files changed, 185 insertions(+), 236 deletions(-) delete mode 100644 scripts/bloom-inference-server/cache_ds_checkpoints.py delete mode 100644 scripts/bloom-inference-server/constants.py delete mode 100644 
scripts/bloom-inference-server/ds_inference/cache.py create mode 100644 scripts/bloom-inference-server/utils/constants.py diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py index af260c99c..452a9d3a4 100644 --- a/scripts/bloom-inference-server/benchmark.py +++ b/scripts/bloom-inference-server/benchmark.py @@ -4,12 +4,14 @@ import torch -import constants import utils from ds_inference import DSInferenceModel from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel from utils import ( + DS_INFERENCE, + DS_ZERO, + HF_ACCELERATE, GenerateRequest, Model, get_argument_parser, @@ -61,7 +63,7 @@ def benchmark_end_to_end(args: argparse.Namespace, args.generate_kwargs ) - print_rank_n(f"generate_kwargs = {request}") + print_rank_n(f"generate_kwargs = {args.generate_kwargs}") print_rank_n(f"batch_size = {args.batch_size}") # warmup is a must if measuring speed as it's when all the optimizations are performed @@ -117,35 +119,30 @@ def get_args() -> argparse.Namespace: "--deployment_framework", type=str, choices=[ - constants.HF_ACCELERATE, - constants.DS_INFERENCE, - constants.DS_ZERO + HF_ACCELERATE, + DS_INFERENCE, + DS_ZERO ], - default=constants.HF_ACCELERATE + default=HF_ACCELERATE ) group.add_argument("--benchmark_cycles", type=int, default=0, help="additionally run benchmark") group.add_argument("--local_rank", required=False, type=int, help="used by dist launchers") group.add_argument("--batch_size", default=1, type=int, help="batch size") - group.add_argument("--save_mp_checkpoint_path", required=False, - type=str, help="MP checkpoints path for DS inference") group.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload for DS ZeRO") args = utils.get_args(parser) launched_with_deepspeed = args.deployment_framework in [ - constants.DS_INFERENCE, constants.DS_ZERO] + DS_INFERENCE, DS_ZERO] if (not launched_with_deepspeed): assert args.local_rank == None, "local_rank must be None if not launched with DeepSpeed" - if (args.save_mp_checkpoint_path): - assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" - if (args.cpu_offload): - assert args.deployment_framework == constants.DS_ZERO, "cpu_offload only works with DS_ZeRO" + assert args.deployment_framework == DS_ZERO, "cpu_offload only works with DS_ZeRO" return args @@ -153,11 +150,11 @@ def get_args() -> argparse.Namespace: def main() -> None: args = get_args() - if (args.deployment_framework == constants.HF_ACCELERATE): + if (args.deployment_framework == HF_ACCELERATE): benchmark_end_to_end(args, HFAccelerateModel) - elif (args.deployment_framework == constants.DS_INFERENCE): + elif (args.deployment_framework == DS_INFERENCE): benchmark_end_to_end(args, DSInferenceModel) - elif (args.deployment_framework == constants.DS_ZERO): + elif (args.deployment_framework == DS_ZERO): benchmark_end_to_end(args, DSZeROModel, zero_activated=True) else: raise ValueError( diff --git a/scripts/bloom-inference-server/cache_ds_checkpoints.py b/scripts/bloom-inference-server/cache_ds_checkpoints.py deleted file mode 100644 index f3a75c3dd..000000000 --- a/scripts/bloom-inference-server/cache_ds_checkpoints.py +++ /dev/null @@ -1,27 +0,0 @@ -import argparse - -import utils -from ds_inference import cache_ds_checkpoints -from utils import get_argument_parser - - -def get_args() -> argparse.Namespace: - parser = get_argument_parser() - - group = parser.add_argument_group(title="launch config") - 
group.add_argument("--local_rank", required=False, - type=int, help="used by dist launchers") - group.add_argument("--save_mp_checkpoint_path", required=True, - type=str, help="MP checkpoints path for DS inference") - - args = utils.get_args(parser) - - return args - - -def main() -> None: - cache_ds_checkpoints(get_args()) - - -if (__name__ == "__main__"): - main() diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py index ced0ef4ff..1036c0439 100644 --- a/scripts/bloom-inference-server/cli.py +++ b/scripts/bloom-inference-server/cli.py @@ -2,11 +2,10 @@ import json import sys -import constants import utils from ds_inference import DSInferenceGRPCServer from hf_accelerate import HFAccelerateModel -from utils import get_argument_parser, parse_generate_kwargs, print_rank_n +from utils import DS_INFERENCE, HF_ACCELERATE, get_argument_parser, parse_generate_kwargs, print_rank_n def get_args() -> argparse.Namespace: @@ -17,30 +16,25 @@ def get_args() -> argparse.Namespace: "--deployment_framework", type=str, choices=[ - constants.HF_ACCELERATE, - constants.DS_INFERENCE + HF_ACCELERATE, + DS_INFERENCE ], - default=constants.HF_ACCELERATE + default=HF_ACCELERATE ) - group.add_argument("--save_mp_checkpoint_path", required=False, - type=str, help="MP checkpoints path for DS inference") group.add_argument("--shutdown_command", required=False, type=str, default="__shutdown__", help="This string will exit the script") args = utils.get_args(parser) - if (args.save_mp_checkpoint_path): - assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" - return args def main() -> None: args = get_args() - if (args.deployment_framework == constants.HF_ACCELERATE): + if (args.deployment_framework == HF_ACCELERATE): model = HFAccelerateModel(args) - elif (args.deployment_framework == constants.DS_INFERENCE): + elif (args.deployment_framework == DS_INFERENCE): model = DSInferenceGRPCServer(args) else: raise ValueError( diff --git a/scripts/bloom-inference-server/constants.py b/scripts/bloom-inference-server/constants.py deleted file mode 100644 index 8c16f280c..000000000 --- a/scripts/bloom-inference-server/constants.py +++ /dev/null @@ -1,3 +0,0 @@ -HF_ACCELERATE = "hf_accelerate" -DS_INFERENCE = "ds_inference" -DS_ZERO = "ds_zero" diff --git a/scripts/bloom-inference-server/ds_inference/__init__.py b/scripts/bloom-inference-server/ds_inference/__init__.py index 654495525..47e0181df 100644 --- a/scripts/bloom-inference-server/ds_inference/__init__.py +++ b/scripts/bloom-inference-server/ds_inference/__init__.py @@ -1,3 +1,2 @@ -from .cache import cache_ds_checkpoints from .grpc_server import DSInferenceGRPCServer from .model import DSInferenceModel diff --git a/scripts/bloom-inference-server/ds_inference/cache.py b/scripts/bloom-inference-server/ds_inference/cache.py deleted file mode 100644 index cbbf1b6f6..000000000 --- a/scripts/bloom-inference-server/ds_inference/cache.py +++ /dev/null @@ -1,68 +0,0 @@ -import argparse -import os -import shutil - -import deepspeed -import torch -from transformers import AutoConfig, AutoModelForCausalLM - -from utils import print_rank_n, run_rank_n - -from .model import write_checkponts_json - - -def cache_ds_checkpoints(args: argparse.Namespace) -> None: - if (args.local_rank == 0): - print_rank_n("Loading model...") - world_size = int(os.getenv("WORLD_SIZE", "1")) - - # Load model - with deepspeed.OnDevice(dtype=args.dtype, device="meta"): - model = 
AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(args.model_name), - torch_dtype=torch.bfloat16 - ) - model = model.eval() - - # Write checkpoints.json - tmp_directory = "tmp" - run_rank_n( - os.makedirs, - { - "name": tmp_directory, - "exist_ok": True - } - ) - checkpoints_json = os.path.join(tmp_directory, "checkpoints.json") - run_rank_n( - write_checkponts_json, - { - "checkpoints_json": checkpoints_json, - "model_name": args.model_name - }, - barrier=True - ) - - run_rank_n( - os.makedirs, - { - "name": args.save_mp_checkpoint_path, - "exist_ok": True - }, - barrier=True - ) - - if (args.dtype == torch.float16): - model = deepspeed.init_inference( - model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True, - save_mp_checkpoint_path=args.save_mp_checkpoint_path - ) - elif (args.dtype == torch.bfloat16): - raise NotImplementedError("bfloat16 is not yet supported") - - run_rank_n(shutil.rmtree, {"path": tmp_directory}) - print_rank_n("Model loaded") diff --git a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py index d31b39c6a..fdbd6e552 100644 --- a/scripts/bloom-inference-server/ds_inference/model.py +++ b/scripts/bloom-inference-server/ds_inference/model.py @@ -3,14 +3,14 @@ import os import shutil from argparse import Namespace +from pathlib import Path import deepspeed import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from transformers.modeling_utils import get_checkpoint_shard_files -from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_offline_mode -from transformers.utils.hub import EntryNotFoundError +from transformers.utils import is_offline_mode +from huggingface_hub import snapshot_download from utils import Model, print_rank_n, run_rank_n @@ -25,115 +25,91 @@ def __init__(self, args: Namespace) -> None: self.pad = self.tokenizer.pad_token_id # Load model - with deepspeed.OnDevice(dtype=args.dtype, device="meta"): + with deepspeed.OnDevice(dtype=torch.float16, device="meta"): self.model = AutoModelForCausalLM.from_config( AutoConfig.from_pretrained(args.model_name), torch_dtype=torch.bfloat16 ) self.model = self.model.eval() - # Write checkpoints.json - tmp_directory = "tmp" - run_rank_n( - os.makedirs, - { - "name": tmp_directory, - "exist_ok": True - } - ) - checkpoints_json = os.path.join(tmp_directory, "checkpoints.json") - run_rank_n( - write_checkponts_json, - { - "checkpoints_json": checkpoints_json, - "model_name": args.model_name - }, - barrier=True - ) - - if (args.save_mp_checkpoint_path): - checkpoints_json = os.path.join( - args.save_mp_checkpoint_path, "BLOOM-176B_ds-inference_config.json") - - if (args.dtype == torch.float16): - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True - ) + if (args.dtype in [torch.float16, torch.int8]): + if (args.use_pre_sharded_checkpoints): + model_path = snapshot_download( + args.model_name, + allow_patterns=["*"], + local_files_only=is_offline_mode(), + revision=None + ) + checkpoints_json = os.path.join( + model_path, "BLOOM_ds-inference_config.json") + + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + base_dir=model_path, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) + else: + # Write checkpoints.json + tmp_directory = "tmp" + run_rank_n( + 
os.makedirs, + { + "name": tmp_directory, + "exist_ok": True + } + ) + checkpoints_json = os.path.join( + tmp_directory, "checkpoints.json") + run_rank_n( + write_checkponts_json, + { + "checkpoints_json": checkpoints_json, + "model_name": args.model_name + }, + barrier=True + ) + + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) + + run_rank_n(shutil.rmtree, {"path": tmp_directory}) elif (args.dtype == torch.bfloat16): raise NotImplementedError("bfloat16 is not yet supported") - run_rank_n(shutil.rmtree, {"path": tmp_directory}) - self.model = self.model.module self.input_device = torch.cuda.current_device() print_rank_n("Model loaded") -def get_checkpoint_files(pretrained_model_name_or_path): - # XXX: I just hacked this one together to automatically handle the fetching of the model file or - # shards into cache and returning the cached entries - note that I removed most arguments - cache_dir = None - is_sharded = False - - # XXX: preparation for revision branches if needed - revision = None - #revision = "sharded" - - # this supports nodes with no network (so you need to pre-cache the model and the tokenizer with - # python -c "from transformers import AutoModel; AutoModel.from_pretrained('bigscience/bloom')" - if (is_offline_mode()): - print("Offline mode: forcing local_files_only=True") - local_files_only = True - else: - local_files_only = False - - filename = WEIGHTS_NAME - archive_file = hf_bucket_url( - pretrained_model_name_or_path, filename=filename, revision=revision) - - try: - resolved_archive_file = cached_path( - archive_file, cache_dir=cache_dir, local_files_only=local_files_only,) - return [resolved_archive_file] - except (EntryNotFoundError, FileNotFoundError): - if filename == WEIGHTS_NAME: - # Maybe the checkpoint is sharded, we try to grab the index name in this case. - archive_file = hf_bucket_url( - pretrained_model_name_or_path, - filename=WEIGHTS_INDEX_NAME, - revision=revision, - ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - local_files_only=local_files_only, - ) - is_sharded = True - - if (is_sharded): - # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( - pretrained_model_name_or_path, - resolved_archive_file, - cache_dir=cache_dir, - revision=revision - ) +def get_checkpoint_files(model_name_or_path, revision=None): + # loads files from hub + cached_repo_dir = snapshot_download( + model_name_or_path, + allow_patterns=["*"], + local_files_only=is_offline_mode(), + revision=revision + ) - return resolved_archive_file + # creates a list of paths from all downloaded files in cache dir, matching the regex *.pt + file_list = [str(entry) for entry in Path( + cached_repo_dir).rglob('*.pt') if entry.is_file()] + return file_list def write_checkponts_json(checkpoints_json: str, model_name: str) -> None: - with io.open(checkpoints_json, 'w', encoding='utf-8') as f: - #checkpoint_dir = "/gpfsscratch/rech/six/commun/uan68tv-model-conversion/bloom" - #checkpoint_files = glob.glob(f"{checkpoint_dir}/*bin") + with io.open(checkpoints_json, "w", encoding="utf-8") as f: checkpoint_files = get_checkpoint_files(model_name) data = { - "type": "BLOOM-176B", + "type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0 } diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 91099e528..aa3d7e72c 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -1,16 +1,17 @@ import argparse -import logging -import os import sys import traceback -import constants import utils from ds_inference import DSInferenceGRPCServer from fastapi import FastAPI, HTTPException from hf_accelerate import HFAccelerateModel from pydantic import BaseModel from utils import ( + DS_INFERENCE, + HF_ACCELERATE, + ForwardRequest, + ForwardResponse, GenerateRequest, GenerateResponse, TokenizeRequest, @@ -35,13 +36,11 @@ def get_args() -> argparse.Namespace: "--deployment_framework", type=str, choices=[ - constants.HF_ACCELERATE, - constants.DS_INFERENCE, + HF_ACCELERATE, + DS_INFERENCE, ], - default=constants.HF_ACCELERATE + default=HF_ACCELERATE ) - group.add_argument("--save_mp_checkpoint_path", required=False, - type=str, help="MP checkpoints path for DS inference") group.add_argument("--host", type=str, required=True, help="host address") group.add_argument("--port", type=int, required=True, help="port number") group.add_argument("--workers", type=int, default=1, @@ -53,9 +52,6 @@ def get_args() -> argparse.Namespace: args = utils.get_args(parser) - if (args.save_mp_checkpoint_path): - assert args.deployment_framework == constants.DS_INFERENCE, "save_mp_checkpoint_path only works with DS inference" - return args @@ -63,11 +59,9 @@ def get_args() -> argparse.Namespace: args = get_args() app = FastAPI() -logger = logging.getLogger(__name__) - -if (args.deployment_framework == constants.HF_ACCELERATE): +if (args.deployment_framework == HF_ACCELERATE): model = HFAccelerateModel(args) -elif (args.deployment_framework == constants.DS_INFERENCE): +elif (args.deployment_framework == DS_INFERENCE): model = DSInferenceGRPCServer(args) else: raise ValueError( diff --git a/scripts/bloom-inference-server/utils/__init__.py b/scripts/bloom-inference-server/utils/__init__.py index 9ee67a87f..96f4cf293 100644 --- a/scripts/bloom-inference-server/utils/__init__.py +++ b/scripts/bloom-inference-server/utils/__init__.py @@ -1,3 +1,4 @@ +from .constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE from .model import Model from .requests import ( GenerateRequest, diff --git a/scripts/bloom-inference-server/utils/constants.py 
b/scripts/bloom-inference-server/utils/constants.py new file mode 100644 index 000000000..e870431f9 --- /dev/null +++ b/scripts/bloom-inference-server/utils/constants.py @@ -0,0 +1,40 @@ +from pickle import TRUE + + +HF_ACCELERATE = "hf_accelerate" +DS_INFERENCE = "ds_inference" +DS_ZERO = "ds_zero" + +BIGSCIENCE_BLOOM = "bigscience/bloom" +DS_INFERENCE_BLOOM_FP16 = "microsoft/bloom-deepspeed-inference-fp16" +DS_INFERENCE_BLOOM_INT8 = "microsoft/bloom-deepspeed-inference-int8" + +BF16 = "bf16" +FP16 = "fp16" +INT8 = "int8" + +FRAMEWORK_MODEL_DTYPE_ALLOWED = { + HF_ACCELERATE: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } + }, + DS_INFERENCE: { + BIGSCIENCE_BLOOM: { + FP16 + }, + DS_INFERENCE_BLOOM_FP16: { + FP16 + }, + DS_INFERENCE_BLOOM_INT8: { + INT8 + } + }, + DS_ZERO: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } + } +} diff --git a/scripts/bloom-inference-server/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py index 9fec36364..6d71df59d 100644 --- a/scripts/bloom-inference-server/utils/utils.py +++ b/scripts/bloom-inference-server/utils/utils.py @@ -10,6 +10,13 @@ from pydantic import BaseModel +from .constants import ( + BIGSCIENCE_BLOOM, + DS_INFERENCE_BLOOM_FP16, + DS_INFERENCE_BLOOM_INT8, + FRAMEWORK_MODEL_DTYPE_ALLOWED, +) + dummy_input_sentences = [ "DeepSpeed is a machine learning framework", @@ -27,10 +34,19 @@ def get_argument_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() group = parser.add_argument_group(title="model") - group.add_argument("--model_name", type=str, - required=True, help="model to use") + group.add_argument( + "--model_name", + type=str, + required=True, + choices=[ + BIGSCIENCE_BLOOM, + DS_INFERENCE_BLOOM_FP16, + DS_INFERENCE_BLOOM_INT8 + ], + help="model to use" + ) group.add_argument("--dtype", type=str, required=True, - choices=["bf16", "fp16"], help="dtype for model") + choices=["bf16", "fp16", "int8"], help="dtype for model") group.add_argument( "--generate_kwargs", type=str, @@ -43,8 +59,17 @@ def get_argument_parser() -> argparse.ArgumentParser: def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() + + assert is_framework_model_dtype_allowed( + args.deployment_framework, + args.model_name, + args.dtype + ), "unsupported deployment_framework, model_name and dtype" + args.dtype = get_torch_dtype(args.dtype) args.generate_kwargs = json.loads(args.generate_kwargs) + args.use_pre_sharded_checkpoints = args.model_name in [ + DS_INFERENCE_BLOOM_FP16, DS_INFERENCE_BLOOM_INT8] return args @@ -75,11 +100,22 @@ def print_rank_n(*values, rank: int = 0) -> None: print(*values) +def get_dtype_from_model_name(model_name: str) -> str: + if (model_name == BIGSCIENCE_BLOOM): + return "bf16" + elif (model_name == DS_INFERENCE_BLOOM_FP16): + return "fp16" + elif (model_name == DS_INFERENCE_BLOOM_INT8): + return "int8" + + def get_torch_dtype(dtype_str: str) -> torch.dtype: if (dtype_str == "bf16"): return torch.bfloat16 elif (dtype_str == "fp16"): return torch.float16 + elif (dtype_str == "int8"): + return torch.int8 def get_str_dtype(dtype_str: str) -> torch.dtype: @@ -87,6 +123,8 @@ def get_str_dtype(dtype_str: str) -> torch.dtype: return "bf16" elif (dtype_str == torch.float16): return "fp16" + elif (dtype_str == torch.int8): + return "int8" def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]: @@ -131,3 +169,11 @@ def pad_ids(arrays, padding, max_length=-1): array for array in arrays] return arrays + + +def is_framework_model_dtype_allowed(deployment_framework: 
str, model_name: str, dtype: str) -> bool: + if (deployment_framework in FRAMEWORK_MODEL_DTYPE_ALLOWED): + if (model_name in FRAMEWORK_MODEL_DTYPE_ALLOWED[deployment_framework]): + if (dtype in FRAMEWORK_MODEL_DTYPE_ALLOWED[deployment_framework][model_name]): + return True + return False From 81d146927fc3e16cb2e05c23efaf641a7fc23b8e Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Mon, 29 Aug 2022 08:51:58 +0530 Subject: [PATCH 30/32] use snapshot_download --- scripts/bloom-inference-server/benchmark.py | 13 +- scripts/bloom-inference-server/cli.py | 17 +-- .../ds_inference/model.py | 118 ++++++++---------- .../bloom-inference-server/ds_zero/model.py | 10 +- .../hf_accelerate/model.py | 9 +- scripts/bloom-inference-server/server.py | 14 +-- .../bloom-inference-server/utils/__init__.py | 4 +- .../bloom-inference-server/utils/constants.py | 74 ++++++++--- scripts/bloom-inference-server/utils/model.py | 13 ++ scripts/bloom-inference-server/utils/utils.py | 38 ++++-- 10 files changed, 166 insertions(+), 144 deletions(-) diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py index 452a9d3a4..729ff58e7 100644 --- a/scripts/bloom-inference-server/benchmark.py +++ b/scripts/bloom-inference-server/benchmark.py @@ -9,6 +9,7 @@ from ds_zero import DSZeROModel from hf_accelerate import HFAccelerateModel from utils import ( + BENCHMARK, DS_INFERENCE, DS_ZERO, HF_ACCELERATE, @@ -115,16 +116,6 @@ def get_args() -> argparse.Namespace: parser = get_argument_parser() group = parser.add_argument_group(title="launch config") - group.add_argument( - "--deployment_framework", - type=str, - choices=[ - HF_ACCELERATE, - DS_INFERENCE, - DS_ZERO - ], - default=HF_ACCELERATE - ) group.add_argument("--benchmark_cycles", type=int, default=0, help="additionally run benchmark") group.add_argument("--local_rank", required=False, @@ -133,7 +124,7 @@ def get_args() -> argparse.Namespace: group.add_argument("--cpu_offload", action="store_true", help="whether to activate CPU offload for DS ZeRO") - args = utils.get_args(parser) + args = utils.get_args(parser, BENCHMARK) launched_with_deepspeed = args.deployment_framework in [ DS_INFERENCE, DS_ZERO] diff --git a/scripts/bloom-inference-server/cli.py b/scripts/bloom-inference-server/cli.py index 1036c0439..477a0ad2e 100644 --- a/scripts/bloom-inference-server/cli.py +++ b/scripts/bloom-inference-server/cli.py @@ -5,26 +5,17 @@ import utils from ds_inference import DSInferenceGRPCServer from hf_accelerate import HFAccelerateModel -from utils import DS_INFERENCE, HF_ACCELERATE, get_argument_parser, parse_generate_kwargs, print_rank_n +from utils import CLI, DS_INFERENCE, HF_ACCELERATE, get_argument_parser, parse_generate_kwargs, print_rank_n def get_args() -> argparse.Namespace: parser = get_argument_parser() group = parser.add_argument_group(title="launch config") - group.add_argument( - "--deployment_framework", - type=str, - choices=[ - HF_ACCELERATE, - DS_INFERENCE - ], - default=HF_ACCELERATE - ) group.add_argument("--shutdown_command", required=False, type=str, default="__shutdown__", help="This string will exit the script") - args = utils.get_args(parser) + args = utils.get_args(parser, CLI) return args @@ -43,10 +34,6 @@ def main() -> None: generate_kwargs = args.generate_kwargs while (True): - # currently only 1 process is running so its - # fine but might need to run_rank_n for this - # if running a deployment_framework with - # multiple processes try: input_text = input("Input text: ") diff --git 
a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py index fdbd6e552..7939fdb41 100644 --- a/scripts/bloom-inference-server/ds_inference/model.py +++ b/scripts/bloom-inference-server/ds_inference/model.py @@ -1,17 +1,15 @@ +import glob import io import json import os import shutil from argparse import Namespace -from pathlib import Path import deepspeed import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from transformers.utils import is_offline_mode -from huggingface_hub import snapshot_download -from utils import Model, print_rank_n, run_rank_n +from utils import Model, get_downloaded_model_path, print_rank_n, run_rank_n class DSInferenceModel(Model): @@ -21,66 +19,41 @@ def __init__(self, args: Namespace) -> None: print_rank_n("Loading model...") world_size = int(os.getenv("WORLD_SIZE", "1")) - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + downloaded_model_path = get_downloaded_model_path(args.model_name) + + self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path) self.pad = self.tokenizer.pad_token_id # Load model with deepspeed.OnDevice(dtype=torch.float16, device="meta"): self.model = AutoModelForCausalLM.from_config( - AutoConfig.from_pretrained(args.model_name), + AutoConfig.from_pretrained(downloaded_model_path), torch_dtype=torch.bfloat16 ) self.model = self.model.eval() if (args.dtype in [torch.float16, torch.int8]): if (args.use_pre_sharded_checkpoints): - model_path = snapshot_download( - args.model_name, - allow_patterns=["*"], - local_files_only=is_offline_mode(), - revision=None - ) checkpoints_json = os.path.join( - model_path, "BLOOM_ds-inference_config.json") + downloaded_model_path, "BLOOM_ds-inference_config.json") self.model = deepspeed.init_inference( self.model, mp_size=world_size, - base_dir=model_path, + base_dir=downloaded_model_path, dtype=args.dtype, checkpoint=checkpoints_json, replace_with_kernel_inject=True ) else: - # Write checkpoints.json - tmp_directory = "tmp" - run_rank_n( - os.makedirs, - { - "name": tmp_directory, - "exist_ok": True - } - ) - checkpoints_json = os.path.join( - tmp_directory, "checkpoints.json") - run_rank_n( - write_checkponts_json, - { - "checkpoints_json": checkpoints_json, - "model_name": args.model_name - }, - barrier=True - ) - - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - dtype=args.dtype, - checkpoint=checkpoints_json, - replace_with_kernel_inject=True - ) - - run_rank_n(shutil.rmtree, {"path": tmp_directory}) + with TemporaryCheckpointsJSON(downloaded_model_path) as checkpoints_json: + self.model = deepspeed.init_inference( + self.model, + mp_size=world_size, + dtype=args.dtype, + checkpoint=checkpoints_json, + replace_with_kernel_inject=True + ) elif (args.dtype == torch.bfloat16): raise NotImplementedError("bfloat16 is not yet supported") @@ -90,27 +63,38 @@ def __init__(self, args: Namespace) -> None: print_rank_n("Model loaded") -def get_checkpoint_files(model_name_or_path, revision=None): - # loads files from hub - cached_repo_dir = snapshot_download( - model_name_or_path, - allow_patterns=["*"], - local_files_only=is_offline_mode(), - revision=revision - ) - - # creates a list of paths from all downloaded files in cache dir, matching the regex *.pt - file_list = [str(entry) for entry in Path( - cached_repo_dir).rglob('*.pt') if entry.is_file()] - return file_list - - -def write_checkponts_json(checkpoints_json: str, model_name: str) -> None: - with 
io.open(checkpoints_json, "w", encoding="utf-8") as f: - checkpoint_files = get_checkpoint_files(model_name) - data = { - "type": "BLOOM", - "checkpoints": checkpoint_files, - "version": 1.0 - } - json.dump(data, f) +class TemporaryCheckpointsJSON: + def __init__(self, model_path: str): + self.tmp_directory = "tmp" + self.tmp_file = os.path.join(self.tmp_directory, "checkpoints.json") + self.model_path = model_path + + def write_checkpoints_json(self, model_path: str) -> None: + with io.open(self.tmp_file, "w", encoding="utf-8") as f: + data = { + "type": "BLOOM", + "checkpoints": glob.glob(f"{model_path}/*.bin"), + "version": 1.0 + } + json.dump(data, f) + + def __enter__(self): + run_rank_n( + os.makedirs, + { + "name": self.tmp_directory, + "exist_ok": True + } + ) + run_rank_n( + self.write_checkpoints_json, + { + "model_path": self.model_path + }, + barrier=True + ) + return self.tmp_file + + def __exit__(self, type, value, traceback): + # run_rank_n(shutil.rmtree, {"path": self.tmp_directory}) + return diff --git a/scripts/bloom-inference-server/ds_zero/model.py b/scripts/bloom-inference-server/ds_zero/model.py index 7106b2f6e..026251544 100644 --- a/scripts/bloom-inference-server/ds_zero/model.py +++ b/scripts/bloom-inference-server/ds_zero/model.py @@ -6,7 +6,7 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.deepspeed import HfDeepSpeedConfig -from utils import Model +from utils import Model, get_downloaded_model_path class DSZeROModel(Model): @@ -14,7 +14,9 @@ def __init__(self, args: Namespace) -> None: if (args.local_rank == 0): print("Loading model...") - config = AutoConfig.from_pretrained(args.model_name) + downloaded_model_path = get_downloaded_model_path(args.model_name) + + config = AutoConfig.from_pretrained(downloaded_model_path) world_size = int(os.getenv('WORLD_SIZE', '1')) train_batch_size = 1 * world_size @@ -49,11 +51,11 @@ def __init__(self, args: Namespace) -> None: # this tells from_pretrained to instantiate directly on gpus dschf = HfDeepSpeedConfig(ds_config) - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path) self.pad = self.tokenizer.pad_token_id self.model = AutoModelForCausalLM.from_pretrained( - args.model_name, torch_dtype=args.dtype) + downloaded_model_path, torch_dtype=args.dtype) self.model = self.model.eval() self.model = deepspeed.initialize( model=self.model, config_params=ds_config)[0] diff --git a/scripts/bloom-inference-server/hf_accelerate/model.py b/scripts/bloom-inference-server/hf_accelerate/model.py index 1d58fd605..77d27a4af 100644 --- a/scripts/bloom-inference-server/hf_accelerate/model.py +++ b/scripts/bloom-inference-server/hf_accelerate/model.py @@ -3,24 +3,27 @@ import torch from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from utils import Model, print_rank_n +from utils import Model, get_downloaded_model_path, print_rank_n class HFAccelerateModel(Model): def __init__(self, args: Namespace) -> None: print_rank_n("Loading model...") - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + downloaded_model_path = get_downloaded_model_path(args.model_name) + + self.tokenizer = AutoTokenizer.from_pretrained(downloaded_model_path) self.pad = self.tokenizer.pad_token_id self.model = AutoModelForCausalLM.from_pretrained( - args.model_name, + downloaded_model_path, device_map="auto", max_memory=get_max_memory_per_gpu_dict( args.dtype, args.model_name), torch_dtype=args.dtype ) 
+ self.model.requires_grad_(False) self.model.eval() self.input_device = "cuda:0" diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index aa3d7e72c..3d55eb3b9 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -10,8 +10,7 @@ from utils import ( DS_INFERENCE, HF_ACCELERATE, - ForwardRequest, - ForwardResponse, + SERVER, GenerateRequest, GenerateResponse, TokenizeRequest, @@ -32,15 +31,6 @@ def get_args() -> argparse.Namespace: parser = get_argument_parser() group = parser.add_argument_group(title="launch config") - group.add_argument( - "--deployment_framework", - type=str, - choices=[ - HF_ACCELERATE, - DS_INFERENCE, - ], - default=HF_ACCELERATE - ) group.add_argument("--host", type=str, required=True, help="host address") group.add_argument("--port", type=int, required=True, help="port number") group.add_argument("--workers", type=int, default=1, @@ -50,7 +40,7 @@ def get_args() -> argparse.Namespace: group.add_argument("--debug", action="store_true", help="launch in debug mode") - args = utils.get_args(parser) + args = utils.get_args(parser, SERVER) return args diff --git a/scripts/bloom-inference-server/utils/__init__.py b/scripts/bloom-inference-server/utils/__init__.py index 96f4cf293..5b706d970 100644 --- a/scripts/bloom-inference-server/utils/__init__.py +++ b/scripts/bloom-inference-server/utils/__init__.py @@ -1,5 +1,5 @@ -from .constants import DS_INFERENCE, DS_ZERO, HF_ACCELERATE -from .model import Model +from .constants import BENCHMARK, CLI, DS_INFERENCE, DS_ZERO, HF_ACCELERATE, SERVER +from .model import Model, get_downloaded_model_path from .requests import ( GenerateRequest, GenerateResponse, diff --git a/scripts/bloom-inference-server/utils/constants.py b/scripts/bloom-inference-server/utils/constants.py index e870431f9..50f583f21 100644 --- a/scripts/bloom-inference-server/utils/constants.py +++ b/scripts/bloom-inference-server/utils/constants.py @@ -1,5 +1,6 @@ -from pickle import TRUE - +BENCHMARK = "benchmark" +CLI = "cli" +SERVER = "server" HF_ACCELERATE = "hf_accelerate" DS_INFERENCE = "ds_inference" @@ -13,28 +14,63 @@ FP16 = "fp16" INT8 = "int8" -FRAMEWORK_MODEL_DTYPE_ALLOWED = { - HF_ACCELERATE: { - BIGSCIENCE_BLOOM: { - BF16, - FP16 + +SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED = { + BENCHMARK: { + HF_ACCELERATE: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } + }, + DS_INFERENCE: { + BIGSCIENCE_BLOOM: { + FP16 + }, + DS_INFERENCE_BLOOM_FP16: { + FP16 + }, + DS_INFERENCE_BLOOM_INT8: { + INT8 + } + }, + DS_ZERO: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } } }, - DS_INFERENCE: { - BIGSCIENCE_BLOOM: { - FP16 - }, - DS_INFERENCE_BLOOM_FP16: { - FP16 + CLI: { + HF_ACCELERATE: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } }, - DS_INFERENCE_BLOOM_INT8: { - INT8 + DS_INFERENCE: { + DS_INFERENCE_BLOOM_FP16: { + FP16 + }, + DS_INFERENCE_BLOOM_INT8: { + INT8 + } } }, - DS_ZERO: { - BIGSCIENCE_BLOOM: { - BF16, - FP16 + SERVER: { + HF_ACCELERATE: { + BIGSCIENCE_BLOOM: { + BF16, + FP16 + } + }, + DS_INFERENCE: { + DS_INFERENCE_BLOOM_FP16: { + FP16 + }, + DS_INFERENCE_BLOOM_INT8: { + INT8 + } } } } diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py index e437d32eb..0d2053462 100644 --- a/scripts/bloom-inference-server/utils/model.py +++ b/scripts/bloom-inference-server/utils/model.py @@ -1,6 +1,10 @@ import argparse +import os import torch +from transformers.utils import is_offline_mode + +from huggingface_hub import snapshot_download 
from .requests import GenerateRequest, GenerateResponse, TokenizeRequest, TokenizeResponse from .utils import print_rank_n @@ -84,3 +88,12 @@ def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: def shutdown(self) -> None: print_rank_n("shutting down") exit() + + +def get_downloaded_model_path(model_name: str): + return snapshot_download( + model_name, + allow_patterns=["*"], + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None) + ) diff --git a/scripts/bloom-inference-server/utils/utils.py b/scripts/bloom-inference-server/utils/utils.py index 6d71df59d..722df86e4 100644 --- a/scripts/bloom-inference-server/utils/utils.py +++ b/scripts/bloom-inference-server/utils/utils.py @@ -8,13 +8,14 @@ import torch import torch.distributed as dist -from pydantic import BaseModel - from .constants import ( BIGSCIENCE_BLOOM, + DS_INFERENCE, DS_INFERENCE_BLOOM_FP16, DS_INFERENCE_BLOOM_INT8, - FRAMEWORK_MODEL_DTYPE_ALLOWED, + DS_ZERO, + HF_ACCELERATE, + SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED ) @@ -34,6 +35,16 @@ def get_argument_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() group = parser.add_argument_group(title="model") + group.add_argument( + "--deployment_framework", + type=str, + choices=[ + HF_ACCELERATE, + DS_INFERENCE, + DS_ZERO + ], + default=HF_ACCELERATE + ) group.add_argument( "--model_name", type=str, @@ -57,14 +68,15 @@ def get_argument_parser() -> argparse.ArgumentParser: return parser -def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: +def get_args(parser: argparse.ArgumentParser, script: str) -> argparse.Namespace: args = parser.parse_args() - assert is_framework_model_dtype_allowed( + assert is_script_framework_model_dtype_allowed( + script, args.deployment_framework, args.model_name, args.dtype - ), "unsupported deployment_framework, model_name and dtype" + ), f"{script} is not supported with {args.deployment_framework}, {args.model_name} and {args.dtype} dtype" args.dtype = get_torch_dtype(args.dtype) args.generate_kwargs = json.loads(args.generate_kwargs) @@ -171,9 +183,13 @@ def pad_ids(arrays, padding, max_length=-1): return arrays -def is_framework_model_dtype_allowed(deployment_framework: str, model_name: str, dtype: str) -> bool: - if (deployment_framework in FRAMEWORK_MODEL_DTYPE_ALLOWED): - if (model_name in FRAMEWORK_MODEL_DTYPE_ALLOWED[deployment_framework]): - if (dtype in FRAMEWORK_MODEL_DTYPE_ALLOWED[deployment_framework][model_name]): - return True +def is_script_framework_model_dtype_allowed(script: str, + deployment_framework: str, + model_name: str, + dtype: str) -> bool: + if (script in SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED): + if (deployment_framework in SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED[script]): + if (model_name in SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED[script][deployment_framework]): + if (dtype in SCRIPT_FRAMEWORK_MODEL_DTYPE_ALLOWED[script][deployment_framework][model_name]): + return True return False From a85b48868f80c85df6c7bbb51314183f0ef8ea42 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Mon, 29 Aug 2022 10:54:18 +0530 Subject: [PATCH 31/32] make server class --- scripts/bloom-inference-server/server.py | 190 ++++++++++++++--------- 1 file changed, 113 insertions(+), 77 deletions(-) diff --git a/scripts/bloom-inference-server/server.py b/scripts/bloom-inference-server/server.py index 3d55eb3b9..4b155c503 100644 --- a/scripts/bloom-inference-server/server.py +++ b/scripts/bloom-inference-server/server.py @@ -5,6 +5,7 @@ import utils from ds_inference import 
DSInferenceGRPCServer from fastapi import FastAPI, HTTPException +from fastapi.routing import APIRoute from hf_accelerate import HFAccelerateModel from pydantic import BaseModel from utils import ( @@ -45,93 +46,128 @@ def get_args() -> argparse.Namespace: return args -#################################################################################### -args = get_args() -app = FastAPI() - -if (args.deployment_framework == HF_ACCELERATE): - model = HFAccelerateModel(args) -elif (args.deployment_framework == DS_INFERENCE): - model = DSInferenceGRPCServer(args) -else: - raise ValueError( - f"Unknown deployment framework {args.deployment_framework}") - -query_ids = QueryID() -#################################################################################### - - -def get_exception_response(query_id: int, method: str): - e_type, e_message, e_stack_trace = sys.exc_info() - response = { - "error": str(e_type.__name__), - "message": str(e_message), - "query_id": query_id, - "method": method - } - - if (args.debug): - trace_back = traceback.extract_tb(e_stack_trace) - - # Format stacktrace - stack_trace = [] - for trace in trace_back: - stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( - trace[0], trace[1], trace[2], trace[3])) - - response["stack_trace"] = stack_trace - - return response +class Server: + def __init__(self, args: argparse.Namespace): + self.host = args.host + self.port = args.port + self.workers = args.workers + self.debug = args.debug + + self.allowed_max_new_tokens = args.allowed_max_new_tokens + self.query_ids = QueryID() + + if (args.deployment_framework == HF_ACCELERATE): + self.model = HFAccelerateModel(args) + elif (args.deployment_framework == DS_INFERENCE): + self.model = DSInferenceGRPCServer(args) + else: + raise ValueError( + f"Unknown deployment framework {args.deployment_framework}") + + self.app = FastAPI( + routes=[ + APIRoute( + "/generate/", + self.generate, + methods=["POST"], + ), + APIRoute( + "/tokenize/", + self.tokenize, + methods=["POST"], + ), + APIRoute( + "/query_id/", + self.query_id, + methods=["GET"], + ) + ], + timeout=600, + ) + def get_exception_response(self, query_id: int, method: str): + e_type, e_message, e_stack_trace = sys.exc_info() + response = { + "error": str(e_type.__name__), + "message": str(e_message), + "query_id": query_id, + "method": method + } -@app.post("/generate/") -def generate(request: GenerateRequest) -> GenerateResponse: - try: - request.max_new_tokens = get_num_tokens_to_generate( - request.max_new_tokens, args.allowed_max_new_tokens) + if (self.debug): + trace_back = traceback.extract_tb(e_stack_trace) - response, total_time_taken = run_and_log_time( - (model.generate, {"request": request}) - ) + # Format stacktrace + stack_trace = [] + for trace in trace_back: + stack_trace.append("File : {}, Line : {}, Func.Name : {}, Message : {}".format( + trace[0], trace[1], trace[2], trace[3])) - response.query_id = query_ids.generate_query_id - query_ids.generate_query_id += 1 - response.total_time_taken = "{:.2f} secs".format(total_time_taken) + response["stack_trace"] = stack_trace return response - except Exception: - response = get_exception_response( - query_ids.generate_query_id, request.method) - query_ids.generate_query_id += 1 - raise HTTPException(500, response) - -@app.post("/tokenize/") -def tokenize(request: TokenizeRequest) -> TokenizeResponse: - try: - response, total_time_taken = run_and_log_time( - (model.tokenize, {"request": request}) + def generate(self, request: GenerateRequest) 
-> GenerateResponse: + try: + request.max_new_tokens = get_num_tokens_to_generate( + request.max_new_tokens, self.allowed_max_new_tokens) + + response, total_time_taken = run_and_log_time( + (self.model.generate, {"request": request}) + ) + + response.query_id = self.query_ids.generate_query_id + self.query_ids.generate_query_id += 1 + response.total_time_taken = "{:.2f} secs".format(total_time_taken) + + return response + except Exception: + response = self.get_exception_response( + self.query_ids.generate_query_id, request.method) + self.query_ids.generate_query_id += 1 + raise HTTPException(500, response) + + def tokenize(self, request: TokenizeRequest) -> TokenizeResponse: + try: + response, total_time_taken = run_and_log_time( + (self.model.tokenize, {"request": request}) + ) + + response.query_id = self.query_ids.tokenize_query_id + self.query_ids.tokenize_query_id += 1 + response.total_time_taken = "{:.2f} msecs".format( + total_time_taken * 1000) + + return response + except Exception: + response = self.get_exception_response( + self.query_ids.tokenize_query_id, request.method) + self.query_ids.tokenize_query_id += 1 + raise HTTPException(500, response) + + def query_id(self) -> QueryID: + return self.query_ids + + def run(self): + run( + self.app, + host=self.host, + port=self.port, + workers=self.workers ) - response.query_id = query_ids.tokenize_query_id - query_ids.tokenize_query_id += 1 - response.total_time_taken = "{:.2f} msecs".format( - total_time_taken * 1000) - - return response - except Exception: - response = get_exception_response( - query_ids.tokenize_query_id, request.method) - query_ids.tokenize_query_id += 1 - raise HTTPException(500, response) + def shutdown(self): + self.model.shutdown() -@app.get("/query_id/") -def query_id() -> QueryID: - return query_ids +def main() -> None: + args = get_args() + server = Server(args) + try: + server.run() + except KeyboardInterrupt: + server.shutdown() -try: - run(app, host=args.host, port=args.port, workers=args.workers) -except KeyboardInterrupt: - model.shutdown() +if (__name__ == "__main__"): + main() From 43a844cbedbc4c8ac827afd698dd8a6be19b10e2 Mon Sep 17 00:00:00 2001 From: Mayank Mishra Date: Thu, 1 Sep 2022 17:08:09 +0530 Subject: [PATCH 32/32] fix snapshot download --- scripts/bloom-inference-server/benchmark.py | 3 +++ .../ds_inference/model.py | 8 ++----- .../bloom-inference-server/ds_zero/model.py | 7 +++--- scripts/bloom-inference-server/utils/model.py | 22 ++++++++++++++----- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/scripts/bloom-inference-server/benchmark.py b/scripts/bloom-inference-server/benchmark.py index 729ff58e7..23c519a09 100644 --- a/scripts/bloom-inference-server/benchmark.py +++ b/scripts/bloom-inference-server/benchmark.py @@ -2,6 +2,7 @@ import gc import os +import deepspeed import torch import utils @@ -144,8 +145,10 @@ def main() -> None: if (args.deployment_framework == HF_ACCELERATE): benchmark_end_to_end(args, HFAccelerateModel) elif (args.deployment_framework == DS_INFERENCE): + deepspeed.init_distributed("nccl") benchmark_end_to_end(args, DSInferenceModel) elif (args.deployment_framework == DS_ZERO): + deepspeed.init_distributed("nccl") benchmark_end_to_end(args, DSZeROModel, zero_activated=True) else: raise ValueError( diff --git a/scripts/bloom-inference-server/ds_inference/model.py b/scripts/bloom-inference-server/ds_inference/model.py index 7939fdb41..c42aa5250 100644 --- a/scripts/bloom-inference-server/ds_inference/model.py +++ 
b/scripts/bloom-inference-server/ds_inference/model.py @@ -2,7 +2,6 @@ import io import json import os -import shutil from argparse import Namespace import deepspeed @@ -14,9 +13,7 @@ class DSInferenceModel(Model): def __init__(self, args: Namespace) -> None: - if (args.local_rank == 0): - # print_rank_n won't work here since deepspeed is not initialized yet - print_rank_n("Loading model...") + print_rank_n("Loading model...") world_size = int(os.getenv("WORLD_SIZE", "1")) downloaded_model_path = get_downloaded_model_path(args.model_name) @@ -35,7 +32,7 @@ def __init__(self, args: Namespace) -> None: if (args.dtype in [torch.float16, torch.int8]): if (args.use_pre_sharded_checkpoints): checkpoints_json = os.path.join( - downloaded_model_path, "BLOOM_ds-inference_config.json") + downloaded_model_path, "ds_inference_config.json") self.model = deepspeed.init_inference( self.model, @@ -96,5 +93,4 @@ def __enter__(self): return self.tmp_file def __exit__(self, type, value, traceback): - # run_rank_n(shutil.rmtree, {"path": self.tmp_directory}) return diff --git a/scripts/bloom-inference-server/ds_zero/model.py b/scripts/bloom-inference-server/ds_zero/model.py index 026251544..8bb0d7cfc 100644 --- a/scripts/bloom-inference-server/ds_zero/model.py +++ b/scripts/bloom-inference-server/ds_zero/model.py @@ -6,13 +6,12 @@ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.deepspeed import HfDeepSpeedConfig -from utils import Model, get_downloaded_model_path +from utils import Model, get_downloaded_model_path, print_rank_n class DSZeROModel(Model): def __init__(self, args: Namespace) -> None: - if (args.local_rank == 0): - print("Loading model...") + print_rank_n("Loading model...") downloaded_model_path = get_downloaded_model_path(args.model_name) @@ -63,3 +62,5 @@ def __init__(self, args: Namespace) -> None: self.model = self.model.module self.input_device = torch.cuda.current_device() + + print_rank_n("Model loaded") diff --git a/scripts/bloom-inference-server/utils/model.py b/scripts/bloom-inference-server/utils/model.py index 0d2053462..267256a9b 100644 --- a/scripts/bloom-inference-server/utils/model.py +++ b/scripts/bloom-inference-server/utils/model.py @@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download from .requests import GenerateRequest, GenerateResponse, TokenizeRequest, TokenizeResponse -from .utils import print_rank_n +from .utils import print_rank_n, run_rank_n class Model: @@ -91,9 +91,19 @@ def shutdown(self) -> None: def get_downloaded_model_path(model_name: str): - return snapshot_download( - model_name, - allow_patterns=["*"], - local_files_only=is_offline_mode(), - cache_dir=os.getenv("TRANSFORMERS_CACHE", None) + kwargs = { + "repo_id": model_name, + "allow_patterns": ["*"], + "local_files_only": is_offline_mode(), + "cache_dir": os.getenv("TRANSFORMERS_CACHE", None) + } + # download only on 1 process + run_rank_n( + snapshot_download, + kwargs, + barrier=True ) + # now since the snapshot is downloaded, pass the + # model_path to all processes + model_path = snapshot_download(**kwargs) + return model_path
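
The `run_rank_n(snapshot_download, kwargs, barrier=True)` call in the last hunk relies on a helper that lives elsewhere in `utils/utils.py` and is not shown in this patch. As a rough, self-contained sketch of the pattern the hunk implements (the function name and the distributed checks below are illustrative, not the repo's actual `run_rank_n`): only rank 0 touches the Hub, the other ranks wait at a barrier, and afterwards every rank resolves the already-cached snapshot path.

    import torch.distributed as dist
    from huggingface_hub import snapshot_download

    def download_on_rank_0(download_kwargs: dict) -> str:
        # illustrative helper, not the repo's run_rank_n
        rank = dist.get_rank() if dist.is_initialized() else 0
        if rank == 0:
            # only one process actually downloads the weights
            snapshot_download(**download_kwargs)
        if dist.is_initialized():
            # the remaining processes wait until the download has finished
            dist.barrier()
        # the files are now in the local HF cache, so this call on every
        # rank only resolves the cached path without re-downloading
        return snapshot_download(**download_kwargs)

Calling it with the same keyword dict the patch builds (repo_id, allow_patterns, local_files_only, cache_dir) gives each process the identical local model path while the Hub is hit only once per node.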