Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update all-in-one benchmark for continuation task input preparation #11760

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 25 additions & 32 deletions python/llm/dev/benchmark/all-in-one/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,16 +75,9 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
actual_in_len, actual_out_len, load_time, model.peak_memory])


def get_continuation_input_str(in_len):
    """Return the continuation prompt text to use for an input of length ``in_len``.

    Prompt files live under ``prompt/continuation/`` (relative to the current
    working directory) and are named ``32.txt``, ``256.txt``, ``1024.txt``,
    ``2048.txt`` and ``8192.txt``.  Because a prompt file may turn out shorter
    than needed, the search starts from twice the requested length and keeps
    doubling until it lands on one of the available files; anything beyond
    the largest file falls back to ``8192.txt``.

    Args:
        in_len: Requested input length; must be a positive integer.

    Returns:
        The full text of the selected prompt file.

    Raises:
        ValueError: If ``in_len`` is not positive (doubling such a value
            would never reach an available length).
        OSError: If the selected prompt file cannot be opened.
    """
    if in_len <= 0:
        # Doubling zero or a negative number never terminates the search.
        raise ValueError(f"in_len must be a positive integer, got {in_len!r}")

    available_lengths = (32, 256, 1024, 2048, 8192)
    max_length = available_lengths[-1]

    # A prompt file may be shorter than we need, so start from a context
    # twice as long as requested to make sure the input is long enough.
    test_length = min(in_len * 2, max_length)
    while test_length not in available_lengths and test_length < max_length:
        test_length *= 2
    # Overshooting the largest file means we simply use the largest file.
    test_length = min(test_length, max_length)

    # Explicit encoding keeps results independent of the platform locale;
    # the context manager closes the file deterministically.
    with open(f"prompt/continuation/{test_length}.txt", 'r', encoding='utf-8') as prompt_file:
        return prompt_file.read()
def get_continuation_input_str():
    """Return the full text of the shared continuation prompt.

    Every input length currently uses ``prompt/continuation/8192.txt``
    (relative to the current working directory); callers are expected to
    trim the result to the length they need.

    Returns:
        The contents of ``8192.txt`` decoded as UTF-8.

    Raises:
        OSError: If the prompt file cannot be opened.
    """
    # Keep 'utf-8' as the character encoding so decoding does not depend on
    # the platform locale; the context manager closes the file promptly.
    with open("prompt/continuation/8192.txt", 'r', encoding='utf-8') as prompt_file:
        return prompt_file.read()


def preprocess_prompt(tokenizer, in_len, task):
Expand Down Expand Up @@ -231,7 +224,7 @@ def run_native_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
n_ctx = in_len + out_len if in_len + out_len > 512 else 512
Expand Down Expand Up @@ -287,7 +280,7 @@ def run_transformer_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -346,7 +339,7 @@ def run_pytorch_autocast_bf16(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -411,7 +404,7 @@ def run_optimize_model(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -520,7 +513,7 @@ def run_transformer_int4_gpu(repo_id,
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
if task == 'continuation':
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -602,7 +595,7 @@ def transformers_int4_npu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -673,7 +666,7 @@ def run_optimize_model_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -738,7 +731,7 @@ def run_ipex_fp16_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -809,7 +802,7 @@ def run_bigdl_fp16_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -895,7 +888,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -988,7 +981,7 @@ def run_transformer_int4_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1105,7 +1098,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1215,7 +1208,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1325,7 +1318,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1408,7 +1401,7 @@ def run_transformer_autocast_bf16( repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1473,7 +1466,7 @@ def run_bigdl_ipex_bf16(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1537,7 +1530,7 @@ def run_bigdl_ipex_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1601,7 +1594,7 @@ def run_bigdl_ipex_int8(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1705,7 +1698,7 @@ def get_int_from_env(env_keys, default):
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1776,7 +1769,7 @@ def run_speculative_cpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1848,7 +1841,7 @@ def run_speculative_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down Expand Up @@ -1929,7 +1922,7 @@ def run_pipeline_parallel_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
input_str = get_continuation_input_str(in_len)
input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
Expand Down
Loading