HF accelerate model
Mayank Mishra authored and Mayank Mishra committed Aug 12, 2022
1 parent 47f3fc0 commit 435af43
Showing 2 changed files with 95 additions and 93 deletions.
@@ -1,4 +1,5 @@
import gc
from typing import List, Union

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
@@ -7,7 +8,6 @@
from utils import (
Execute,
benchmark_generation,
generate,
get_argument_parser,
get_benchmark_results,
get_dummy_batch,
@@ -16,6 +16,59 @@
)


class HFAccelerateModel:
def __init__(self, model_name: str, dtype: torch.dtype) -> None:
print_rank_n("Loading model...")

self.tokenizer = AutoTokenizer.from_pretrained(model_name)

self.model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
max_memory=get_max_memory_per_gpu_dict(
dtype, model_name),
torch_dtype=dtype
)

self.model.eval()
self.input_device = "cuda:0"

print_rank_n("Model loaded")

def generate(self,
text: Union[str, List[str]],
generate_kwargs: dict,
remove_input_from_output: bool = False) -> Union[str, List[str]]:
if (type(text) == str):
text = [text]

input_tokens = self.tokenizer(text, return_tensors="pt", padding=True)

for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(self.input_device)

with torch.no_grad():
output_tokens = self.model.generate(
**input_tokens,
**generate_kwargs
)

input_token_lengths = [x.shape[0] for x in input_tokens.input_ids]
output_token_lengths = [x.shape[0] for x in output_tokens]
generated_tokens = [
o - i for i, o in zip(input_token_lengths, output_token_lengths)]

if (remove_input_from_output):
output_tokens = [x[-i:]
for x, i in zip(output_tokens, generated_tokens)]

output_text = self.tokenizer.batch_decode(
output_tokens, skip_special_tokens=True)

return output_text, generated_tokens


def get_args():
parser = get_argument_parser()

@@ -50,80 +103,74 @@ def get_max_memory_per_gpu_dict(dtype, model_name):
# from https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing
model_params = l*(12*h**2 + 13*h) + v*h + 4*h
except:
print(f"The model {model_name} has a broken config file. Please notify the owner")
print_rank_n(
f"The model {model_name} has a broken config file. Please notify the owner")
raise

bytes = torch.finfo(dtype).bits / 8
param_memory_total_in_bytes = model_params * bytes
# add 5% since weight sizes aren't the same and some GPU may need more memory
param_memory_per_gpu_in_bytes = int(param_memory_total_in_bytes / n_gpus * 1.05)
print(f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights")
param_memory_per_gpu_in_bytes = int(
param_memory_total_in_bytes / n_gpus * 1.05)
print_rank_n(
f"Estimating {param_memory_per_gpu_in_bytes/2**30:0.2f}GB per gpu for weights")

# check the real available memory
# load cuda kernels first and only measure the real free memory after loading (shorter by ~2GB)
torch.ones(1).cuda()
max_memory_per_gpu_in_bytes = torch.cuda.mem_get_info(0)[0]
if max_memory_per_gpu_in_bytes < param_memory_per_gpu_in_bytes:
raise ValueError(f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)")
raise ValueError(
f"Unable to generate the memory map automatically as the needed estimated memory per gpu ({param_memory_per_gpu_in_bytes/2**30:0.2f}GB) is bigger than the available per gpu memory ({max_memory_per_gpu_in_bytes/2**30:0.2f}GB)")

return {i: param_memory_per_gpu_in_bytes for i in range(torch.cuda.device_count())}
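
# Illustrative check of the sizing formula above (an aside, not part of the commit):
# the h/l/v values are assumed to be those of BLOOM-176B and an 8-GPU node is assumed.
h, l, v, n_gpus = 14336, 70, 250880, 8
model_params = l * (12 * h**2 + 13 * h) + v * h + 4 * h  # ~1.76e11 parameters
per_gpu_gib = model_params * 2 / n_gpus * 1.05 / 2**30   # 2 bytes/param in bf16, 5% margin
print(f"{model_params / 1e9:.1f}B params -> ~{per_gpu_gib:.1f} GiB of weights per GPU")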


def main():
args = get_args()
print_rank_n(f"Loading model {args.model_name}")

(tokenizer, model), initialization_time = run_and_log_time(
[
Execute(
AutoTokenizer.from_pretrained,
{
"pretrained_model_name_or_path": args.model_name,
}
),
Execute(
AutoModelForCausalLM.from_pretrained,
{
"pretrained_model_name_or_path": args.model_name,
"device_map": "auto",
"max_memory": get_max_memory_per_gpu_dict(args.dtype, args.model_name),
"torch_dtype": args.dtype
}
)
]
model, initialization_time = run_and_log_time(
Execute(
HFAccelerateModel,
{
"model_name": args.model_name,
"dtype": args.dtype,
}
)
)

if (args.generate_kwargs):
generate_kwargs = args.generate_kwargs
else:
generate_kwargs = {
"max_new_tokens": 100,
"do_sample": False
}

print_rank_n(
f"*** Starting to generate {args.max_new_tokens} tokens with bs={args.batch_size}")
f"*** Starting to generate {generate_kwargs['max_new_tokens']} tokens with bs={args.batch_size}")

input_sentences = get_dummy_batch(args.batch_size)
generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)

print_rank_n(f"Generate args {generate_kwargs}")

# warmup is a must if measuring speed as it's when all the optimizations are performed
# e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs
_ = generate(
model.generate(
input_sentences,
model,
tokenizer,
generate_kwargs,
"cuda:0"
generate_kwargs
)

generated, generation_time = run_and_log_time(
(output_text, num_generated_tokens), generation_time = run_and_log_time(
Execute(
generate,
model.generate,
{
"inputs": input_sentences,
"model": model,
"tokenizer": tokenizer,
"generate_kwargs": generate_kwargs,
"input_device": "cuda:0"
"text": input_sentences,
"generate_kwargs": generate_kwargs
}
)
)
for i, (o, _) in zip(input_sentences, generated):
for i, (o, _) in zip(input_sentences, zip(output_text, num_generated_tokens)):
print_rank_n(f"{'-' * 60}\nin = {i}\nout = {o}\n")

if (args.benchmark_cycles > 0):
@@ -133,13 +180,7 @@ def main():
gc.collect()

# warm up
_ = generate(
input_sentences,
model,
tokenizer,
generate_kwargs,
"cuda:0"
)
model.generate(input_sentences, generate_kwargs)
torch.cuda.synchronize()

# benchmark
@@ -149,9 +190,7 @@ def main():
{
"input_sentences": input_sentences,
"model": model,
"tokenizer": tokenizer,
"generate_kwargs": generate_kwargs,
"input_device": "cuda:0",
"generate_kwargs": generate_kwargs
}
)
)
@@ -165,5 +204,6 @@ def main():
)
)


if (__name__ == "__main__"):
main()
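
For context, a minimal usage sketch of the new HFAccelerateModel wrapper follows. It is not taken from the diff: the import path is a guess (the first file's name is not visible in this view), and the checkpoint name and generation settings are illustrative assumptions.

import torch

from hf_accelerate import HFAccelerateModel  # assumed module path for the file above

# Load the checkpoint sharded across all visible GPUs via accelerate's device_map="auto".
model = HFAccelerateModel(model_name="bigscience/bloom-560m", dtype=torch.bfloat16)

# generate_kwargs is forwarded directly to transformers' generate().
output_text, num_generated_tokens = model.generate(
    text=["DeepSpeed is a machine learning framework"],
    generate_kwargs={"max_new_tokens": 20, "do_sample": False},
    remove_input_from_output=True,
)
print(output_text[0], num_generated_tokens[0])
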
scripts/inference/utils.py (50 changes: 6 additions & 44 deletions)
@@ -45,16 +45,8 @@ def get_argument_parser():
group.add_argument("--dtype", type=str, required=True,
choices=["bf16", "fp16"], help="dtype for model")
group.add_argument("--batch_size", default=1, type=int, help="batch size")

group = parser.add_argument_group(title="default values")
group.add_argument("--greedy", action="store_true")
group.add_argument("--top_k", type=int, default=0, help="default top_k")
group.add_argument("--top_p", type=float, default=0, help="default top_p")
group.add_argument("--temperature", type=float,
default=1, help="default temperature")
group.add_argument("--min_length", type=int, default=1, help="min length")
group.add_argument("--max_new_tokens", type=int,
default=100, help="max new tokens")
group.add_argument("--generate_kwargs", type=dict, default={},
help="generate parameters. look at https://huggingface.co/docs/transformers/v4.21.1/en/main_classes/text_generation#transformers.generation_utils.GenerationMixin.generate to see the supported parameters")

return parser

@@ -108,31 +100,6 @@ def get_dummy_batch(batch_size: int, input_sentences: List[str] = None) -> List[str]:
return input_sentences


def generate(inputs: List[str],
model: AutoModelForCausalLM,
tokenizer: AutoTokenizer,
generate_kwargs: dict,
input_device) -> Tuple[List[str], List[int]]:
""" returns a list of zipped outputs and number of new tokens """

input_tokens = tokenizer(
inputs, return_tensors="pt", padding=True)
for t in input_tokens:
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(input_device)

outputs = model.generate(**input_tokens, **generate_kwargs)

input_tokens_lengths = [x.shape[0] for x in input_tokens.input_ids]
output_tokens_lengths = [x.shape[0] for x in outputs]

total_new_tokens = [o-i for i,
o in zip(input_tokens_lengths, output_tokens_lengths)]
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

return zip(outputs, total_new_tokens)


def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], float]:
"""
runs a list of Execute objects and returns a list of outputs and the time taken
Expand All @@ -152,21 +119,16 @@ def run_and_log_time(execs: Union[List[Execute], Execute]) -> Union[List[Any], f

def benchmark_generation(input_sentences,
model,
tokenizer,
generate_kwargs,
input_device,
cycles: int = 5):
total_new_tokens_generated = 0
for _ in range(cycles):
generated = generate(
_, num_generated_tokens = model.generate(
input_sentences,
model,
tokenizer,
generate_kwargs,
input_device
generate_kwargs
)
total_new_tokens_generated += sum(new_tokens for _,
new_tokens in generated)
total_new_tokens_generated += sum(
new_tokens for new_tokens in num_generated_tokens)
return total_new_tokens_generated
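
For completeness, a hedged example of how the updated inference script might be launched after this change; the script path is an assumption (the first file's name is not shown in this view) and bigscience/bloom is only an example checkpoint:

python scripts/inference/hf_accelerate.py --model_name bigscience/bloom --dtype bf16 --batch_size 8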


