
Commit

update
sbalandi committed Nov 19, 2024
1 parent 7ac52d7 commit 35050d3
Showing 5 changed files with 49 additions and 29 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/llm_bench-python.yml
@@ -82,6 +82,12 @@ jobs:
       run: |
         wget -O ./ov_models/soulcard.safetensors https://civitai.com/api/download/models/72591
         python ./tools/llm_bench/benchmark.py -m ./ov_models/dreamlike-art-dreamlike-anime-1.0/FP16/ -pf ./tools/llm_bench/prompts/stable-diffusion.jsonl -d cpu -n 1 --genai --lora ./ov_models/soulcard.safetensors --lora_alphas 0.7
+    - name: Test TinyLlama-1.1B-Chat-v1.0 in Speculative Decoding mode on Linux
+      run: |
+        optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format fp16 ov_models/TinyLlama-1.1B-Chat-v1.0/FP16
+        optimum-cli export openvino --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --trust-remote-code --weight-format int8 ov_models/TinyLlama-1.1B-Chat-v1.0/INT8
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --assistant_confidence_threshold 0.4
+        python ./tools/llm_bench/benchmark.py -m ./ov_models/TinyLlama-1.1B-Chat-v1.0/FP16/ --draft_model ./ov_models/TinyLlama-1.1B-Chat-v1.0/INT8/ -p "Why is the Sun yellow?" -d cpu --draft_device cpu -n 1 --genai --num_assistant_tokens 5
     - name: Test whisper-tiny on Linux
       run: |
         GIT_LFS_SKIP_SMUDGE=1 git clone --depth 1 --branch main --single-branch https://huggingface.co/datasets/facebook/multilingual_librispeech
14 changes: 8 additions & 6 deletions tools/llm_bench/benchmark.py
@@ -140,12 +140,14 @@ def get_argprser():
     parser.add_argument('--lora_alphas', nargs='*', help='Alphas params for LoRA adapters.', required=False, default=[])
     parser.add_argument("--use_cb", action="store_true", help="Use Continuous Batching inference mode")
     parser.add_argument("--cb_config", required=False, default=None, help="Path to file with Continuous Batching Scheduler settings or dict")
-    parser.add_argument("--draft_model", required=False, default=None,
-                        help="Path to draft model folder including IR files for Speculative decoding generation.")
-    parser.add_argument("--draft_device", required=False, default='cpu', help="Inference device for Speculative decoding generation.")
-    parser.add_argument("--num_assistant_tokens", required=False, default=5, help="Config option num_assistant_tokens for Speculative decoding")
-    parser.add_argument("--assistant_confidence_threshold", required=False, default=0,
-                        help="Config option assistant_confidence_threshold for Speculative decodin")
+    parser.add_argument("--draft_model", required=False, default=None,
+                        help="Path to draft model folder including IR files for Speculative decoding generation")
+    parser.add_argument("--draft_device", required=False, default=None, help="Inference device for Speculative decoding of draft model")
+    parser.add_argument("--draft_cb_config", required=False, default=None,
+                        help="Path to file with Continuous Batching Scheduler settings or dict for Speculative decoding of draft model")
+    parser.add_argument("--num_assistant_tokens", required=False, default=None, help="Config option num_assistant_tokens for Speculative decoding")
+    parser.add_argument("--assistant_confidence_threshold", required=False, default=None,
+                        help="Config option assistant_confidence_threshold for Speculative decoding")
     parser.add_argument(
         '--end_token_stopping',
         action='store_true',
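All four speculative-decoding flags now default to None, so downstream code can tell "flag not passed" apart from a real value. Note that none of these add_argument calls set a type=, so argparse delivers strings; the streaming path in text_generation.py below casts them with int()/float(). A minimal stdlib-only sketch of the pattern (hypothetical, not part of the commit):

```python
import argparse

# Hypothetical mini-parser illustrating the default=None pattern used above.
parser = argparse.ArgumentParser()
parser.add_argument("--num_assistant_tokens", default=None)
parser.add_argument("--assistant_confidence_threshold", default=None)
args = parser.parse_args(["--num_assistant_tokens", "5"])

# Only explicitly supplied options survive; values are still strings here,
# which is why the pipeline code casts with int()/float() before use.
overrides = {k: v for k, v in vars(args).items() if v is not None}
print(overrides)  # {'num_assistant_tokens': '5'}
```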
2 changes: 0 additions & 2 deletions tools/llm_bench/llm_bench_utils/model_utils.py
@@ -143,8 +143,6 @@ def analyze_args(args):
     model_args["cb_config"] = cb_config
     model_args['draft_model'] = args.draft_model
     model_args['draft_device'] = args.draft_device
-    if (args.num_assistant_tokens > 0 and args.assistant_confidence_threshold > 0):
-        raise RuntimeError("Parameters `assistant_confidence_threshold` and `num_assistant_tokens` are mutually exclusive")
     model_args['num_assistant_tokens'] = args.num_assistant_tokens
     model_args['assistant_confidence_threshold'] = args.assistant_confidence_threshold
     return model_path, model_framework, model_args, model_name
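The eager mutual-exclusivity check is dropped; it no longer fits the new None defaults (comparing None > 0 raises a TypeError in Python 3), and each option is now applied only when explicitly set. A None-aware equivalent of the removed guard could look like this (a sketch, not part of the commit):

```python
def check_speculative_args(num_assistant_tokens, assistant_confidence_threshold):
    # None-aware variant of the removed guard: reject only the case where the
    # user explicitly set both knobs, since they select alternative
    # speculative-decoding modes (fixed draft length vs. confidence cutoff).
    if num_assistant_tokens is not None and assistant_confidence_threshold is not None:
        raise RuntimeError(
            "Parameters `assistant_confidence_threshold` and `num_assistant_tokens` "
            "are mutually exclusive"
        )
```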
30 changes: 19 additions & 11 deletions tools/llm_bench/llm_bench_utils/ov_utils.py
@@ -204,6 +204,21 @@ def create_text_gen_model(model_path, device, **kwargs):
     return ov_model, tokenizer, from_pretrained_time, bench_hook, False


+def get_scheduler_config_genai(user_config, **kwargs):
+    import openvino_genai
+
+    default_cb_config = {"cache_size": 1}
+    scheduler_config = openvino_genai.SchedulerConfig()
+    scheduler_params = kwargs.get(user_config) or default_cb_config
+    if scheduler_params:
+        log.info(f"Scheduler parameters for {user_config}:\n{scheduler_params}")
+
+        for param, value in scheduler_params.items():
+            setattr(scheduler_config, param, value)
+
+    return scheduler_config
+
+
 def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
     import openvino_tokenizers  # noqa: F401
     import openvino_genai
@@ -218,22 +233,15 @@ def create_genai_text_gen_model(model_path, device, ov_config, **kwargs):
     cb = kwargs.get("use_cb", False)
     if cb or draft_model_path:
         log.info("Continuous Batching mode activated")
-        default_cb_config = {"cache_size": 1}
-        scheduler_config = openvino_genai.SchedulerConfig()
-        scheduler_params = kwargs.get("cb_config") or default_cb_config
-        if scheduler_params:
-            log.info(f"Scheduler parameters:\n{scheduler_params}")
-
-            for param, value in scheduler_params.items():
-                setattr(scheduler_config, param, value)
-        ov_config["scheduler_config"] = scheduler_config
+        ov_config["scheduler_config"] = get_scheduler_config_genai("cb_config", **kwargs)

     if draft_model_path:
         if not Path(draft_model_path).exists():
             raise RuntimeError(f'== Failure ==: draft model path {draft_model_path} does not exist')
         log.info("Speculative Decoding is activated")
-
-        ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, kwargs['draft_device'].upper())
+        draft_device = kwargs.get('draft_device', None) or device
+        ov_config['draft_model'] = openvino_genai.draft_model(draft_model_path, draft_device.upper(),
+                                                              scheduler_config=get_scheduler_config_genai("draft_cb_config", **kwargs))

     adapter_config = get_lora_config(kwargs.get("lora", None), kwargs.get("lora_alphas", []))
     if adapter_config:
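Factoring the scheduler setup into get_scheduler_config_genai lets the main and draft models carry independent Continuous Batching settings. A hedged sketch of what the refactored code ends up constructing, assuming the openvino_genai calls used in the diff are forwarded to the pipeline as properties (paths and wiring illustrative, not taken from this commit):

```python
import openvino_genai

# Equivalent of the default_cb_config fallback above.
main_scheduler = openvino_genai.SchedulerConfig()
main_scheduler.cache_size = 1

# --draft_cb_config now allows a separate scheduler for the draft model;
# here it simply reuses the same defaults.
draft_scheduler = openvino_genai.SchedulerConfig()
draft_scheduler.cache_size = 1

draft = openvino_genai.draft_model(
    "ov_models/TinyLlama-1.1B-Chat-v1.0/INT8",  # illustrative path
    "CPU",
    scheduler_config=draft_scheduler,
)
pipe = openvino_genai.LLMPipeline(
    "ov_models/TinyLlama-1.1B-Chat-v1.0/FP16",  # illustrative path
    "CPU",
    scheduler_config=main_scheduler,
    draft_model=draft,
)
```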
26 changes: 16 additions & 10 deletions tools/llm_bench/task/text_generation.py
@@ -199,11 +199,14 @@ def run_text_generation_genai(input_text, num, model, tokenizer, args, iter_data
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
     if args.get('draft_model', ''):
-        gen_config.num_assistant_tokens = args['num_assistant_tokens']
-        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
-        log.info("Speculative decoding config: ")
-        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
-        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
+        config_info = "Speculative decoding config: "
+        if args.get('num_assistant_tokens', None):
+            gen_config.num_assistant_tokens = args['num_assistant_tokens']
+            config_info += f" num_assistant_tokens {gen_config.num_assistant_tokens}"
+        if args.get('assistant_confidence_threshold', None):
+            gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
+            config_info += f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}"
+        log.info(config_info)
     start = time.perf_counter()
     generation_result = model.generate(input_text_list, gen_config)
     end = time.perf_counter()
@@ -328,11 +331,14 @@ def run_text_generation_genai_with_stream(input_text, num, model, tokenizer, arg
     gen_config.num_beams = args["num_beams"]
     gen_config.do_sample = False
     if args.get('draft_model', ''):
-        gen_config.num_assistant_tokens = args['num_assistant_tokens']
-        gen_config.assistant_confidence_threshold = args['assistant_confidence_threshold']
-        log.info("Speculative decoding config: ")
-        log.info(f" num_assistant_tokens {gen_config.num_assistant_tokens}")
-        log.info(f" assistant_confidence_threshold {gen_config.assistant_confidence_threshold}")
+        config_info = "Speculative decoding config: "
+        if args.get("num_assistant_tokens", None):
+            gen_config.num_assistant_tokens = int(args["num_assistant_tokens"])
+            config_info += f'num_assistant_tokens {args["num_assistant_tokens"]}'
+        if args.get("assistant_confidence_threshold", None):
+            gen_config.assistant_confidence_threshold = float(args["assistant_confidence_threshold"])
+            config_info += f'assistant_confidence_threshold {args["assistant_confidence_threshold"]}'
+        log.info(config_info)
     start = time.perf_counter()
     generated_tokens = model.generate(input_data, gen_config, streamer=streamer).tokens
     end = time.perf_counter()
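After this change each knob is copied into the GenerationConfig only when the user set it, and the streaming variant additionally casts the argparse strings. A hedged usage sketch with the values from the CI step above (max_new_tokens is an illustrative value, not from this commit):

```python
import openvino_genai

gen_config = openvino_genai.GenerationConfig()
gen_config.max_new_tokens = 128  # illustrative value

# Static speculative decoding: the draft model proposes a fixed number of
# tokens per step ...
gen_config.num_assistant_tokens = 5
# ... or dynamic speculative decoding: the draft keeps proposing tokens while
# its own confidence stays above the threshold. Pick one mode, not both.
# gen_config.assistant_confidence_threshold = 0.4
```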
