From c5fb1314a0ab7bb976082afd6f52c77945844041 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Sun, 17 Nov 2024 22:43:48 +0000
Subject: [PATCH 1/2] Add speculative decoding params to lm_bench

---
 tools/llm_bench/benchmark.py                  |  6 +++++
 .../llm_bench/llm_bench_utils/model_utils.py  |  8 +++++-
 tools/llm_bench/llm_bench_utils/ov_utils.py   | 12 +++++++--
 tools/llm_bench/task/text_generation.py       | 25 ++++++++++++++++---
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index 2176373370..d0c71fec21 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -140,6 +140,12 @@ def get_argprser():
     parser.add_argument('--lora_alphas', nargs='*', help='Alphas params for LoRA adapters.', required=False, default=[])
     parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode")
     parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict")
+    parser.add_argument("--draft_model", required=False, default=None,
+                        help="Path to draft model folder including IR files for Speculative decoding generation.")
+    parser.add_argument("--draft_device", required=False, default='cpu', help="Inference device for Speculative decoding generation.")
+    parser.add_argument("--num_assistant_tokens", required=False, default=5, help="Config option num_assistant_tokens for Speculative decoding")
+    parser.add_argument("--assistant_confidence_threshold", required=False, default=0,
+                        help="Config option assistant_confidence_threshold for Speculative decodin")
     parser.add_argument(
         '--end_token_stopping',
         action='store_true',
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 64313e13ab..151d77a643 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -135,12 +135,18 @@ def analyze_args(args):
     model_args['model_type'] = get_model_type(model_name, use_case, model_framework)
     model_args['model_name'] = model_name

-    if args.use_cb and not args.genai:
+    if (args.use_cb or args.draft_model) and not args.genai:
         raise RuntimeError("Continuous batching mode supported only via OpenVINO GenAI")
     cb_config = None
     if args.cb_config:
         cb_config = get_config(args.cb_config)
     model_args["cb_config"] = cb_config
+    model_args['draft_model'] = args.draft_model
+    model_args['draft_device'] = args.draft_device
+    if (args.num_assistant_tokens > 0 and args.assistant_confidence_threshold > 0):
+        raise RuntimeError("Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive")
+    model_args['num_assistant_tokens'] = args.num_assistant_tokens
+    model_args['assistant_confidence_threshold'] = args.assistant_confidence_threshold
     return model_path, model_framework, model_args, model_name


diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 9399978514..60d04c56eb 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -214,8 +214,9 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):

     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

+    draft_model_path = kwargs.get("draft_model", '')
     cb = kwargs.get("use_cb", False)
-    if cb:
+    if cb or draft_model_path:
         log.info("Continuous Batching mode activated")
         default_cb_config = {"cache_size": 1}
         scheduler_config = openvino_genai.SchedulerConfig()
@@ -227,6 +228,13 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
             setattr(scheduler_config, param, value)
         ov_config["scheduler_config"] = scheduler_config

+    if draft_model_path:
+        if not Path(draft_model_path).exists():
+            raise RuntimeError(f'==Failure ==: draft model by path:{draft_model_path} is not exists')
+        log.info("Speculative Decoding is activated")
+
+        ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, kwargs['draft_device'].upper())
+
     adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", []))
     if adapter_config:
         ov_config['adapters'] = adapter_config
@@ -263,7 +271,7 @@ def get_tokens(self):

         def get_time_list(self):
             return self.token_generation_time
-    streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb else None
+    streamer = TokenStreamer(llm_pipe.get_tokenizer()) if cb or draft_model_path else None
     return llm_pipe, tokenizer, end - start, streamer, True


diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index d936721344..1ff0417214 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -194,12 +194,21 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     if args['infer_count'] is not None:
         out_str += 'all max_output_token_size: {} * {}'.format(args['infer_count'], args['batch_size'])
     log.info(out_str)
+    gen_config = model.get_generation_config()
+    gen_config.max_new_tokens = max_gen_tokens
+    gen_config.num_beams = args["num_beams"]
+    gen_config.do_sample = False
+    if args.get('draft_model', ''):
+        gen_config.num_assistant_tokens = args['num_assistant_tokens']
+        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
+        log.info("Speculative decoding config: ")
+        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
+        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
     start = time.perf_counter()
-    generation_result = model.generate(input_text_list, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], do_sample=False)
+    generation_result = model.generate(input_text_list, gen_config)
     end = time.perf_counter()
     generated_text = generation_result.texts
     perf_metrics = generation_result.perf_metrics
-
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()
         max_rss_mem_consumption, max_shared_mem_consumption, max_uss_mem_consumption = mem_consumption.get_max_memory_consumption()
@@ -314,8 +323,18 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
         mem_consumption.start_collect_memory_consumption()
     max_gen_tokens = DEFAULT_OUTPUT_TOKEN_SIZE if args['infer_count'] is None else args['infer_count']
     streamer.reset()
+    gen_config = model.get_generation_config()
+    gen_config.max_new_tokens = max_gen_tokens
+    gen_config.num_beams = args["num_beams"]
+    gen_config.do_sample = False
+    if args.get('draft_model', ''):
+        gen_config.num_assistant_tokens = args['num_assistant_tokens']
+        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
+        log.info("Speculative decoding config: ")
+        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
+        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
     start = time.perf_counter()
-    generated_tokens = model.generate(input_data, max_new_tokens=max_gen_tokens, num_beams=args["num_beams"], streamer=streamer, do_sample=False).tokens
+    generated_tokens = model.generate(input_data, gen_config, streamer=streamer).tokens
     end = time.perf_counter()
     if (args['mem_consumption'] == 1 and num == 0) or args['mem_consumption'] == 2:
         mem_consumption.end_collect_momory_consumption()

From e4155c38aeb8e23d0ba416e5aed488f9ac93545c Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Mon, 18 Nov 2024 21:49:07 +0000
Subject: [PATCH 2/2] update

---
 .github/workflows/llm_bench-python.yml        |  6 ++++
 tools/llm_bench/benchmark.py                  | 14 +++++----
 .../llm_bench/llm_bench_utils/model_utils.py  |  6 ++--
 tools/llm_bench/llm_bench_utils/ov_utils.py   | 31 ++++++++++++-------
 tools/llm_bench/task/text_generation.py       | 26 ++++++++++------
 5 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/llm_bench-python.yml b/.github/workflows/llm_bench-python.yml
index f835ad3415..0ac47d1aa0 100644
--- a/.github/workflows/llm_bench-python.yml
+++ b/.github/workflows/llm_bench-python.yml
@@ -82,6 +82,12 @@ jobs:
         run: |
           wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
           python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+      - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding mode on Linux
+        run: |
+          optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
+          optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --assistant_confidence_threshold 0.4
+          python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --num_assistant_tokens 5
       - name: Test whisper-tiny on Linux
         run: |
           GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
diff --git a/tools/llm_bench/benchmark.py b/tools/llm_bench/benchmark.py
index d0c71fec21..d652c8b48f 100644
--- a/tools/llm_bench/benchmark.py
+++ b/tools/llm_bench/benchmark.py
@@ -140,12 +140,14 @@ def get_argprser():
     parser.add_argument('--lora_alphas', nargs='*', help='Alphas params for LoRA adapters.', required=False, default=[])
     parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode")
     parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict")
-    parser.add_argument("--draft_model", required=False, default=None,
-                        help="Path to draft model folder including IR files for Speculative decoding generation.")
-    parser.add_argument("--draft_device", required=False, default='cpu', help="Inference device for Speculative decoding generation.")
-    parser.add_argument("--num_assistant_tokens", required=False, default=5, help="Config option num_assistant_tokens for Speculative decoding")
-    parser.add_argument("--assistant_confidence_threshold", required=False, default=0,
-                        help="Config option assistant_confidence_threshold for Speculative decodin")
+    parser.add_argument("--draft_model", required=False, default=None,
+                        help="Path to draft model folder including IR files for Speculative decoding generation")
+    parser.add_argument("--draft_device", required=False, default=None, help="Inference device for Speculative decoding of draft model")
+    parser.add_argument("--draft_cb_config", required=False, default=None,
+                        help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model")
+    parser.add_argument("--num_assistant_tokens", required=False, default=None, help="Config option num_assistant_tokens for Speculative decoding")
+    parser.add_argument("--assistant_confidence_threshold", required=False, default=None,
+                        help="Config option assistant_confidence_threshold for Speculative decoding")
     parser.add_argument(
         '--end_token_stopping',
         action='store_true',
diff --git a/tools/llm_bench/llm_bench_utils/model_utils.py b/tools/llm_bench/llm_bench_utils/model_utils.py
index 151d77a643..6539bef232 100644
--- a/tools/llm_bench/llm_bench_utils/model_utils.py
+++ b/tools/llm_bench/llm_bench_utils/model_utils.py
@@ -143,8 +143,10 @@ def analyze_args(args):
     model_args["cb_config"] = cb_config
     model_args['draft_model'] = args.draft_model
     model_args['draft_device'] = args.draft_device
-    if (args.num_assistant_tokens > 0 and args.assistant_confidence_threshold > 0):
-        raise RuntimeError("Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive")
+    draft_cb_config = None
+    if args.draft_cb_config:
+        draft_cb_config = get_config(args.draft_cb_config)
+    model_args["draft_cb_config"] = draft_cb_config
     model_args['num_assistant_tokens'] = args.num_assistant_tokens
     model_args['assistant_confidence_threshold'] = args.assistant_confidence_threshold
     return model_path, model_framework, model_args, model_name
diff --git a/tools/llm_bench/llm_bench_utils/ov_utils.py b/tools/llm_bench/llm_bench_utils/ov_utils.py
index 60d04c56eb..cf0d0d831c 100644
--- a/tools/llm_bench/llm_bench_utils/ov_utils.py
+++ b/tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -204,6 +204,21 @@ def create_text_gen_model(model_path, device, **kwargs):
     return ov_model, tokenizer, from_pretrained_time, bench_hook, False


+def get_scheduler_config_genai(user_config, config_name="CB config"):
+    import openvino_genai
+
+    default_cb_config = {"cache_size": 1}
+    scheduler_config = openvino_genai.SchedulerConfig()
+    scheduler_params = user_config or default_cb_config
+    if scheduler_params:
+        log.info(f"Scheduler parameters for {config_name}:\n{scheduler_params}")
+
+    for param, value in scheduler_params.items():
+        setattr(scheduler_config, param, value)
+
+    return scheduler_config
+
+
 def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_tokenizers  # noqa: F401
     import openvino_genai
@@ -218,22 +233,16 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
     cb = kwargs.get("use_cb", False)
     if cb or draft_model_path:
         log.info("Continuous Batching mode activated")
-        default_cb_config = {"cache_size": 1}
-        scheduler_config = openvino_genai.SchedulerConfig()
-        scheduler_params = kwargs.get("cb_config") or default_cb_config
-        if scheduler_params:
-            log.info(f"Scheduler parameters:\n{scheduler_params}")
-
-        for param, value in scheduler_params.items():
-            setattr(scheduler_config, param, value)
-        ov_config["scheduler_config"] = scheduler_config
+        ov_config["scheduler_config"] = get_scheduler_config_genai(kwargs.get("cb_config"))

     if draft_model_path:
         if not Path(draft_model_path).exists():
             raise RuntimeError(f'==Failure ==: draft model by path:{draft_model_path} is not exists')
         log.info("Speculative Decoding is activated")
-
-        ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, kwargs['draft_device'].upper())
+        draft_device = kwargs.get('draft_device', None) or device
+        draft_model_load_kwargs = {'scheduler_config': get_scheduler_config_genai(kwargs.get("draft_cb_config"), "draft CB config")}\
+            if kwargs.get("draft_cb_config") is not None else {}
+        ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, draft_device.upper(), **draft_model_load_kwargs)

     adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", []))
     if adapter_config:
diff --git a/tools/llm_bench/task/text_generation.py b/tools/llm_bench/task/text_generation.py
index 1ff0417214..029bcdf16d 100644
--- a/tools/llm_bench/task/text_generation.py
+++ b/tools/llm_bench/task/text_generation.py
@@ -199,11 +199,14 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
     if args.get('draft_model', ''):
-        gen_config.num_assistant_tokens = args['num_assistant_tokens']
-        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
-        log.info("Speculative decoding config: ")
-        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
-        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
+        config_info = "Speculative decoding config: "
+        if args.get('num_assistant_tokens', None):
+            gen_config.num_assistant_tokens = args['num_assistant_tokens']
+            config_info += f" num_assistant_tokens {gen_config.num_assistant_tokens}"
+        if args.get('assistant_confidence_threshold', None):
+            gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
+            config_info += f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}"
+        log.info(config_info)
     start = time.perf_counter()
     generation_result = model.generate(input_text_list, gen_config)
     end = time.perf_counter()
@@ -328,11 +331,14 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
     if args.get('draft_model', ''):
-        gen_config.num_assistant_tokens = args['num_assistant_tokens']
-        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
-        log.info("Speculative decoding config: ")
-        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
-        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
+        config_info = "Speculative decoding config: "
+        if args.get("num_assistant_tokens", None):
+            gen_config.num_assistant_tokens = int(args["num_assistant_tokens"])
+            config_info += f'num_assistant_tokens {args["num_assistant_tokens"]}'
+        if args.get("assistant_confidence_threshold", None):
+            gen_config.assistant_confidence_threshold = float(args["assistant_confidence_threshold"])
+            config_info += f'assistant_confidence_threshold {args["assistant_confidence_threshold"]}'
+        log.info(config_info)
     start = time.perf_counter()
     generated_tokens = model.generate(input_data, gen_config, streamer=streamer).tokens
     end = time.perf_counter()
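
Note (not part of the patches): the sketch below shows the openvino_genai calls these new options map onto, using the same TinyLlama model paths and prompt as the CI step added above. The keyword-argument form of the LLMPipeline constructor is assumed from how the benchmark assembles ov_config, and the max_new_tokens value is illustrative only.

    import openvino_genai

    # Scheduler settings for continuous batching; the benchmark falls back to
    # {"cache_size": 1} when no --cb_config / --draft_cb_config is given.
    scheduler_config = openvino_genai.SchedulerConfig()
    scheduler_config.cache_size = 1

    # Draft model on its own device, as selected by --draft_model / --draft_device.
    draft = openvino_genai.draft_model("./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8", "CPU")

    # Assumed constructor form: the benchmark passes these entries via **ov_config.
    pipe = openvino_genai.LLMPipeline("./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16", "CPU",
                                      scheduler_config=scheduler_config, draft_model=draft)

    # Set either num_assistant_tokens or assistant_confidence_threshold,
    # mirroring the two CLI runs in the workflow step.
    gen_config = pipe.get_generation_config()
    gen_config.max_new_tokens = 128  # illustrative value
    gen_config.num_assistant_tokens = 5

    print(pipe.generate("Why is the Sun yellow?", gen_config))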