
Commit

WeiguangHan committed Nov 17, 2023
1 parent e0be197 commit fb59e20
Showing 2 changed files with 43 additions and 43 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/llm_performance_tests.yml
@@ -9,23 +9,23 @@ concurrency:
 on:
   schedule:
     - cron: "00 13 * * *" # GMT time, 13:00 GMT == 21:00 China
-  # pull_request:
-  #   branches: [main]
-  #   paths:
-  #     - ".github/workflows/llm_performance_tests.yml"
-  #     - "python/llm/test/benchmark/**"
-  #     - "python/llm/dev/benchmark/all-in-one/**"
+  pull_request:
+    branches: [main]
+    paths:
+      - ".github/workflows/llm_performance_tests.yml"
+      - "python/llm/test/benchmark/**"
+      - "python/llm/dev/benchmark/all-in-one/**"
   workflow_dispatch:
   workflow_call:
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   llm-cpp-build:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-cpp-build' || github.event.inputs.artifact == 'all' }}
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-cpp-build' || github.event.inputs.artifact == 'all' }}
     uses: ./.github/workflows/llm-binary-build.yml
 
   llm-performance-test-on-arc:
-    if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }}
+    # if: ${{ github.event.schedule || github.event.inputs.artifact == 'llm-performance-test-on-arc' || github.event.inputs.artifact == 'all' }}
     needs: llm-cpp-build
     strategy:
       fail-fast: false
70 changes: 35 additions & 35 deletions python/llm/dev/benchmark/all-in-one/run.py
@@ -359,41 +359,41 @@ def run_transformer_int4_gpu(repo_id,
     result = {}
     with torch.inference_mode():
         for in_out in in_out_pairs:
-            try:
-                in_out_len = in_out.split("-")
-                in_len = int(in_out_len[0])
-                out_len = int(in_out_len[1])
-                # As different tokenizers have different encodings,
-                # in_len.txt may be shorter than we need,
-                # so use a much longer context to guarantee the input length
-                test_length = min(in_len*2, 8192)
-                while test_length not in [32, 256, 1024, 2048, 8192]:
-                    test_length = test_length * 2
-                input_str = open(f"prompt/{test_length}.txt", 'r').read()
-                # As different tokenizers have different encodings,
-                # slice input_ids to ensure the prompt has exactly the required length.
-                input_ids = tokenizer.encode(input_str, return_tensors="pt")
-                input_ids = input_ids[:, :in_len]
-                true_str = tokenizer.batch_decode(input_ids)[0]
-                input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
-                actual_in_len = input_ids.shape[1]
-                result[in_out] = []
-                for i in range(num_trials + warm_up):
-                    st = time.perf_counter()
-                    output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
-                                                num_beams=num_beams)
-                    torch.xpu.synchronize()
-                    end = time.perf_counter()
-                    output_ids = output_ids.cpu()
-                    print("model generate cost: " + str(end - st))
-                    output = tokenizer.batch_decode(output_ids)
-                    print(output[0])
-                    actual_out_len = output_ids.shape[1] - actual_in_len
-                    if i >= warm_up:
-                        result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
-                                               actual_in_len, actual_out_len])
-            except RuntimeError:
-                pass
+            # try:
+            in_out_len = in_out.split("-")
+            in_len = int(in_out_len[0])
+            out_len = int(in_out_len[1])
+            # As different tokenizers have different encodings,
+            # in_len.txt may be shorter than we need,
+            # so use a much longer context to guarantee the input length
+            test_length = min(in_len*2, 8192)
+            while test_length not in [32, 256, 1024, 2048, 8192]:
+                test_length = test_length * 2
+            input_str = open(f"prompt/{test_length}.txt", 'r').read()
+            # As different tokenizers have different encodings,
+            # slice input_ids to ensure the prompt has exactly the required length.
+            input_ids = tokenizer.encode(input_str, return_tensors="pt")
+            input_ids = input_ids[:, :in_len]
+            true_str = tokenizer.batch_decode(input_ids)[0]
+            input_ids = tokenizer.encode(true_str, return_tensors="pt").to('xpu')
+            actual_in_len = input_ids.shape[1]
+            result[in_out] = []
+            for i in range(num_trials + warm_up):
+                st = time.perf_counter()
+                output_ids = model.generate(input_ids, do_sample=False, max_new_tokens=out_len,
+                                            num_beams=num_beams)
+                torch.xpu.synchronize()
+                end = time.perf_counter()
+                output_ids = output_ids.cpu()
+                print("model generate cost: " + str(end - st))
+                output = tokenizer.batch_decode(output_ids)
+                print(output[0])
+                actual_out_len = output_ids.shape[1] - actual_in_len
+                if i >= warm_up:
+                    result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time,
+                                           actual_in_len, actual_out_len])
+            # except RuntimeError:
+            #     pass
     torch.xpu.empty_cache()
     return result
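
The prompt-construction step above can be exercised on its own. Below is a minimal standalone sketch of the same logic, assuming a local prompt/ directory holding 32/256/1024/2048/8192-token text files, any Hugging Face tokenizer (the model id is only a placeholder), and an in_len whose doubling eventually lands on one of those sizes (otherwise the while loop would not terminate):

from transformers import AutoTokenizer

def build_input_ids(tokenizer, in_len):
    # Pick a prompt file at least twice the target length (capped at 8192);
    # the doubling assumes test_length eventually hits an available size.
    test_length = min(in_len * 2, 8192)
    while test_length not in [32, 256, 1024, 2048, 8192]:
        test_length = test_length * 2
    with open(f"prompt/{test_length}.txt") as f:
        input_str = f.read()
    # Tokenizers encode differently, so over-read, slice to exactly in_len
    # tokens, then decode/re-encode so prompt text and ids stay consistent.
    input_ids = tokenizer.encode(input_str, return_tensors="pt")[:, :in_len]
    true_str = tokenizer.batch_decode(input_ids)[0]
    return tokenizer.encode(true_str, return_tensors="pt")

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder id
print(build_input_ids(tokenizer, 1024).shape)  # roughly (1, 1024); re-encoding may shift it slightly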

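One detail worth noting in the timing loop: model.generate() can return while device kernels are still queued, so torch.xpu.synchronize() runs before the second time.perf_counter() read, and warm-up iterations execute but are discarded (i >= warm_up) so one-time costs such as kernel compilation do not skew the recorded numbers. A minimal sketch of that pattern, assuming an XPU-enabled PyTorch build (e.g. via intel_extension_for_pytorch); on CUDA builds, torch.cuda.synchronize() plays the same role:

import time
import torch

def timed_generate(model, input_ids, out_len, num_beams=1):
    # Start the host clock, launch generation, then block until the device
    # queue drains before reading the clock again.
    st = time.perf_counter()
    output_ids = model.generate(input_ids, do_sample=False,
                                max_new_tokens=out_len, num_beams=num_beams)
    torch.xpu.synchronize()  # without this, elapsed can under-count device work
    elapsed = time.perf_counter() - st
    return output_ids.cpu(), elapsed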
