Update all-in-one benchmark for continuation task input preparation #11760

Merged
Changes from 2 commits
57 changes: 25 additions & 32 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -75,16 +75,9 @@ def run_model_in_thread(model, in_out, tokenizer, result, warm_up, num_beams, in
actual_in_len, actual_out_len, load_time, model.peak_memory])


-def get_continuation_input_str(in_len):
-    # in_len.txt maybe shorter than we need,
-    # use much longer context to make sure input length
-    test_length = min(in_len*2, 8192)
-    while test_length not in [32, 256, 1024, 2048, 8192] and test_length < 8192:
-        test_length = test_length * 2
-    # Force the test_length to be 8192, such that we can use 8192.txt
-    if test_length > 8192:
-        test_length = 8192
-    return open(f"prompt/continuation/{test_length}.txt", 'r').read()
+def get_continuation_input_str():
+    # all use 8192.txt for prompt preparation for now
+    return open(f"prompt/continuation/8192.txt", 'r').read()


def preprocess_prompt(tokenizer, in_len, task):
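For reference, a minimal sketch of the new calling convention (the helper `prepare_continuation_ids` and the tokenizer checkpoint below are illustrative, not part of this PR): the prompt text now always comes from 8192.txt, and each runner slices the encoded ids down to the requested input length.

```python
# Sketch only, assuming a Hugging Face tokenizer; mirrors the
# encode-then-slice pattern repeated in each runner below.
from transformers import AutoTokenizer

def prepare_continuation_ids(tokenizer, in_len):
    input_str = get_continuation_input_str()
    # Different tokenizers produce different encodings, so slice the
    # ids to make the prompt exactly the required length.
    input_ids = tokenizer.encode(input_str, return_tensors="pt")
    return input_ids[:, :in_len]

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # illustrative checkpoint
ids = prepare_continuation_ids(tokenizer, in_len=1024)  # shape [1, 1024]
```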
@@ -231,7 +224,7 @@ def run_native_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
n_ctx = in_len + out_len if in_len + out_len > 512 else 512
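Side note: the `n_ctx` expression above is just a sum with a floor of 512; an equivalent form, offered only as a readability suggestion:

```python
n_ctx = max(in_len + out_len, 512)  # same value as the conditional expression above
```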
@@ -287,7 +280,7 @@ def run_transformer_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -346,7 +339,7 @@ def run_pytorch_autocast_bf16(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -411,7 +404,7 @@ def run_optimize_model(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -520,7 +513,7 @@ def run_transformer_int4_gpu(repo_id,
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
if task == 'continuation':
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -602,7 +595,7 @@ def transformers_int4_npu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -673,7 +666,7 @@ def run_optimize_model_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -738,7 +731,7 @@ def run_ipex_fp16_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -809,7 +802,7 @@ def run_bigdl_fp16_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -895,7 +888,7 @@ def run_deepspeed_transformer_int4_cpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -988,7 +981,7 @@ def run_transformer_int4_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1105,7 +1098,7 @@ def run_transformer_int4_fp16_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1215,7 +1208,7 @@ def run_transformer_int4_loadlowbit_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1325,7 +1318,7 @@ def run_transformer_int4_fp16_loadlowbit_gpu_win(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1408,7 +1401,7 @@ def run_transformer_autocast_bf16( repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1473,7 +1466,7 @@ def run_bigdl_ipex_bf16(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1537,7 +1530,7 @@ def run_bigdl_ipex_int4(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1601,7 +1594,7 @@ def run_bigdl_ipex_int8(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1705,7 +1698,7 @@ def get_int_from_env(env_keys, default):
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1776,7 +1769,7 @@ def run_speculative_cpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1848,7 +1841,7 @@ def run_speculative_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")
@@ -1929,7 +1922,7 @@ def run_pipeline_parallel_gpu(repo_id,
in_out_len = in_out.split("-")
in_len = int(in_out_len[0])
out_len = int(in_out_len[1])
-input_str = get_continuation_input_str(in_len)
+input_str = get_continuation_input_str()
# As different tokenizer has different encodings,
# slice the input_ids to ensure the prompt length is required length.
input_ids = tokenizer.encode(input_str, return_tensors="pt")