From de3dc609ee11ec0c97047a18608afcfced6215e8 Mon Sep 17 00:00:00 2001
From: yb-peng <75617475+pengyb2001@users.noreply.github.com>
Date: Tue, 20 Feb 2024 18:55:43 +0800
Subject: [PATCH] Modify harness evaluation workflow (#10174)

* Modify table head in harness

* Specify the file path of fp16.csv

* Split the run step into a nightly run and a PR run (the PR run is for debugging)

* Get fp16.csv by downloading it from GitHub

* Change the method to calculate diff in html table

* Change the method to calculate diff in html table

* Re-arrange job order

* Re-arrange job order

* Change limit

* Change fp16.csv path

* Change highlight rules

* Change limit
---
 .github/workflows/llm-harness-evaluation.yml  | 36 +++++++++++++++++--
 .../benchmark/harness/harness_csv_to_html.py  | 27 ++++++++------
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml
index f01c08ea59b..fa8493f4b27 100644
--- a/.github/workflows/llm-harness-evaluation.yml
+++ b/.github/workflows/llm-harness-evaluation.yml
@@ -166,7 +166,29 @@ jobs:
           fi
       
 
-      - name: Run harness
+      - name: Run harness nightly
+        if: ${{github.event_name == 'schedule'}}
+        shell: bash
+        working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
+        env:
+          USE_XETLA: OFF
+          # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1
+        run: |
+          export HF_HOME=${HARNESS_HF_HOME}
+          export HF_DATASETS=$HARNESS_HF_HOME/datasets
+          export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets
+          source /opt/intel/oneapi/setvars.sh
+          
+          python run_llb.py \
+            --model bigdl-llm \
+            --pretrained ${MODEL_PATH} \
+            --precision ${{ matrix.precision }} \
+            --device ${{ matrix.device }} \
+            --tasks ${{ matrix.task }} \
+            --batch_size 1 --no_cache --output_path results \
+      
+      - name: Run harness pr
+        if: ${{github.event_name == 'pull_request'}}
         shell: bash
         working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness
         env:
@@ -185,6 +207,7 @@ jobs:
             --device ${{ matrix.device }} \
             --tasks ${{ matrix.task }} \
             --batch_size 1 --no_cache --output_path results \
+            --limit 3 \
 
       - uses: actions/upload-artifact@v3
         with:
@@ -226,7 +249,7 @@ jobs:
           python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results
 
   # TODO: change machine to store the results later        
-  llm-harness-summary-nightly:
+  llm-harness-summary-html:
     if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}}
     needs: [set-matrix, llm-harness-evaluation]
     runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"]
@@ -267,6 +290,13 @@ jobs:
           name: harness_results
           path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }}
 
+      # Save fp16.csv in the parent folder of env.nightly_folder
+      - name: Download fp16.csv for summary
+        shell: bash
+        run: |
+          wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/dev/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv
+          ls ${{ env.NIGHTLY_FOLDER}}/..
+
       - name: Summarize the results for nightly run
         if: github.event_name == 'schedule'
         shell: bash
@@ -275,7 +305,7 @@ jobs:
           pip install pandas==1.5.3
           python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}}
           python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}}
-        
+
       - name: Summarize the results for pull request
         if: github.event_name == 'pull_request'
         shell: bash
diff --git a/python/llm/dev/benchmark/harness/harness_csv_to_html.py b/python/llm/dev/benchmark/harness/harness_csv_to_html.py
index 7680f2b5854..1e592fcd931 100644
--- a/python/llm/dev/benchmark/harness/harness_csv_to_html.py
+++ b/python/llm/dev/benchmark/harness/harness_csv_to_html.py
@@ -21,12 +21,14 @@
 import argparse
 import pandas as pd
 
-def highlight_vals(val, max=3.0, color1='red', color2='green'):
+def highlight_vals(val, max=3.0, color1='red', color2='green', color3='yellow'):
     if isinstance(val, float):
         if val > max:
             return 'background-color: %s' % color2
         elif val <= -max:
             return 'background-color: %s' % color1
+        # In range: color3 flags a small non-zero change; 0.0 and NaN get no style ('').
+        return 'background-color: %s' % color3 if val != 0.0 and not pd.isna(val) else ''
     else:
         return ''
 
@@ -80,7 +82,10 @@ def main():
                         help="the baseline path which stores the baseline.csv file")
     args = parser.parse_args()
 
-    fp16_dict = create_fp16_dict('fp16.csv')
+    # fp16.csv is downloaded previously under the parent folder of the folder_path
+    parent_dir = os.path.dirname((args.folder_path))
+    fp16_path = os.path.join(parent_dir, 'fp16.csv')
+    fp16_dict = create_fp16_dict(fp16_path)
 
     csv_files = []
     for file_name in os.listdir(args.folder_path):
@@ -157,11 +162,11 @@ def main():
                     previous_winogrande=previous_csv_row[Winogrande]
                     if previous_arc > 0.0 and previous_truthfulqa > 0.0 and previous_winogrande > 0.0:
                         last_Arc[latest_csv_ind]=previous_arc
-                        diff_Arc[latest_csv_ind]=round((previous_arc-latest_arc)*100/previous_arc,2)
+                        diff_Arc[latest_csv_ind]=round((latest_arc-previous_arc)*100/previous_arc,2)
                         last_TruthfulQA[latest_csv_ind]=previous_truthfulqa
-                        diff_TruthfulQA[latest_csv_ind]=round((previous_truthfulqa-latest_truthfulqa)*100/previous_truthfulqa,2)
+                        diff_TruthfulQA[latest_csv_ind]=round((latest_truthfulqa-previous_truthfulqa)*100/previous_truthfulqa,2)
                         last_Winogrande[latest_csv_ind]=previous_winogrande
-                        diff_Winogrande[latest_csv_ind]=round((previous_winogrande-latest_winogrande)*100/previous_winogrande,2)
+                        diff_Winogrande[latest_csv_ind]=round((latest_winogrande-previous_winogrande)*100/previous_winogrande,2)
                         in_previous_flag=True
 
             if not in_previous_flag:
@@ -172,12 +177,12 @@ def main():
                 last_Winogrande[latest_csv_ind]=pd.NA
                 diff_Winogrande[latest_csv_ind]=pd.NA
 
-        latest_csv.insert(loc=5,column='last_Arc',value=last_Arc)
-        latest_csv.insert(loc=6,column='diff_Arc(%)',value=diff_Arc)
-        latest_csv.insert(loc=7,column='last_TruthfulQA',value=last_TruthfulQA)
-        latest_csv.insert(loc=8,column='diff_TruthfulQA(%)',value=diff_TruthfulQA)
-        latest_csv.insert(loc=9,column='last_Winogrande',value=last_Winogrande)
-        latest_csv.insert(loc=10,column='diff_Winogrande(%)',value=diff_Winogrande)
+        latest_csv.insert(loc=6,column='last_Arc',value=last_Arc)
+        latest_csv.insert(loc=7,column='diff_Arc(%)',value=diff_Arc)
+        latest_csv.insert(loc=8,column='last_TruthfulQA',value=last_TruthfulQA)
+        latest_csv.insert(loc=9,column='diff_TruthfulQA(%)',value=diff_TruthfulQA)
+        latest_csv.insert(loc=10,column='last_Winogrande',value=last_Winogrande)
+        latest_csv.insert(loc=11,column='diff_Winogrande(%)',value=diff_Winogrande)
 
 
         diffs_within_normal_range = is_diffs_within_normal_range(diff_Arc, diff_TruthfulQA, diff_Winogrande, threshold=highlight_threshold)