From de3dc609ee11ec0c97047a18608afcfced6215e8 Mon Sep 17 00:00:00 2001 From: yb-peng <75617475+pengyb2001@users.noreply.github.com> Date: Tue, 20 Feb 2024 18:55:43 +0800 Subject: [PATCH] Modify harness evaluation workflow (#10174) * Modify table head in harness * Specify the file path of fp16.csv * change run to run nightly and run pr to debug * Modify the way to get fp16.csv to downloading from github * Change the method to calculate diff in html table * Change the method to calculate diff in html table * Re-arrange job order * Re-arrange job order * Change limit * Change fp16.csv path * Change highlight rules * Change limit --- .github/workflows/llm-harness-evaluation.yml | 36 +++++++++++++++++-- .../benchmark/harness/harness_csv_to_html.py | 27 ++++++++------ 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/.github/workflows/llm-harness-evaluation.yml b/.github/workflows/llm-harness-evaluation.yml index f01c08ea59b..fa8493f4b27 100644 --- a/.github/workflows/llm-harness-evaluation.yml +++ b/.github/workflows/llm-harness-evaluation.yml @@ -166,7 +166,29 @@ jobs: fi - - name: Run harness + - name: Run harness nightly + if: ${{github.event_name == 'schedule'}} + shell: bash + working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness + env: + USE_XETLA: OFF + # SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS: 1 + run: | + export HF_HOME=${HARNESS_HF_HOME} + export HF_DATASETS=$HARNESS_HF_HOME/datasets + export HF_DATASETS_CACHE=$HARNESS_HF_HOME/datasets + source /opt/intel/oneapi/setvars.sh + + python run_llb.py \ + --model bigdl-llm \ + --pretrained ${MODEL_PATH} \ + --precision ${{ matrix.precision }} \ + --device ${{ matrix.device }} \ + --tasks ${{ matrix.task }} \ + --batch_size 1 --no_cache --output_path results \ + + - name: Run harness pr + if: ${{github.event_name == 'pull_request'}} shell: bash working-directory: ${{ github.workspace }}/python/llm/dev/benchmark/harness env: @@ -185,6 +207,7 @@ jobs: --device ${{ matrix.device }} \ --tasks ${{ matrix.task }} \ --batch_size 1 --no_cache --output_path results \ + --limit 3 \ - uses: actions/upload-artifact@v3 with: @@ -226,7 +249,7 @@ jobs: python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py results # TODO: change machine to store the results later - llm-harness-summary-nightly: + llm-harness-summary-html: if: ${{github.event_name == 'schedule' || github.event_name == 'pull_request'}} needs: [set-matrix, llm-harness-evaluation] runs-on: ["self-hosted", "llm", "accuracy1", "accuracy-nightly"] @@ -267,6 +290,13 @@ jobs: name: harness_results path: ${{ env.PR_FOLDER}}/${{ env.OUTPUT_PATH }} + # Save fp16.csv in the parent folder of env.nightly_folder + - name: Download fp16.csv for summary + shell: bash + run: | + wget https://raw.githubusercontent.com/intel-analytics/BigDL/main/python/llm/dev/benchmark/harness/fp16.csv -O ${{ env.NIGHTLY_FOLDER}}/../fp16.csv + ls ${{ env.NIGHTLY_FOLDER}}/.. + - name: Summarize the results for nightly run if: github.event_name == 'schedule' shell: bash @@ -275,7 +305,7 @@ jobs: pip install pandas==1.5.3 python ${{ github.workspace }}/python/llm/dev/benchmark/harness/make_table_and_csv.py ${{ env.NIGHTLY_FOLDER}}/${{ env.OUTPUT_PATH }} ${{ env.NIGHTLY_FOLDER}} python ${{ github.workspace }}/python/llm/dev/benchmark/harness/harness_csv_to_html.py -f ${{ env.NIGHTLY_FOLDER}} - + - name: Summarize the results for pull request if: github.event_name == 'pull_request' shell: bash diff --git a/python/llm/dev/benchmark/harness/harness_csv_to_html.py b/python/llm/dev/benchmark/harness/harness_csv_to_html.py index 7680f2b5854..1e592fcd931 100644 --- a/python/llm/dev/benchmark/harness/harness_csv_to_html.py +++ b/python/llm/dev/benchmark/harness/harness_csv_to_html.py @@ -21,12 +21,14 @@ import argparse import pandas as pd -def highlight_vals(val, max=3.0, color1='red', color2='green'): +def highlight_vals(val, max=3.0, color1='red', color2='green', color3='yellow'): if isinstance(val, float): if val > max: return 'background-color: %s' % color2 elif val <= -max: return 'background-color: %s' % color1 + elif val != 0.0: + return 'background-color: %s' % color3 else: return '' @@ -80,7 +82,10 @@ def main(): help="the baseline path which stores the baseline.csv file") args = parser.parse_args() - fp16_dict = create_fp16_dict('fp16.csv') + # fp16.csv is downloaded previously under the parent folder of the folder_path + parent_dir = os.path.dirname((args.folder_path)) + fp16_path = os.path.join(parent_dir, 'fp16.csv') + fp16_dict = create_fp16_dict(fp16_path) csv_files = [] for file_name in os.listdir(args.folder_path): @@ -157,11 +162,11 @@ def main(): previous_winogrande=previous_csv_row[Winogrande] if previous_arc > 0.0 and previous_truthfulqa > 0.0 and previous_winogrande > 0.0: last_Arc[latest_csv_ind]=previous_arc - diff_Arc[latest_csv_ind]=round((previous_arc-latest_arc)*100/previous_arc,2) + diff_Arc[latest_csv_ind]=round((latest_arc-previous_arc)*100/previous_arc,2) last_TruthfulQA[latest_csv_ind]=previous_truthfulqa - diff_TruthfulQA[latest_csv_ind]=round((previous_truthfulqa-latest_truthfulqa)*100/previous_truthfulqa,2) + diff_TruthfulQA[latest_csv_ind]=round((latest_truthfulqa-previous_truthfulqa)*100/previous_truthfulqa,2) last_Winogrande[latest_csv_ind]=previous_winogrande - diff_Winogrande[latest_csv_ind]=round((previous_winogrande-latest_winogrande)*100/previous_winogrande,2) + diff_Winogrande[latest_csv_ind]=round((latest_winogrande-previous_winogrande)*100/previous_winogrande,2) in_previous_flag=True if not in_previous_flag: @@ -172,12 +177,12 @@ def main(): last_Winogrande[latest_csv_ind]=pd.NA diff_Winogrande[latest_csv_ind]=pd.NA - latest_csv.insert(loc=5,column='last_Arc',value=last_Arc) - latest_csv.insert(loc=6,column='diff_Arc(%)',value=diff_Arc) - latest_csv.insert(loc=7,column='last_TruthfulQA',value=last_TruthfulQA) - latest_csv.insert(loc=8,column='diff_TruthfulQA(%)',value=diff_TruthfulQA) - latest_csv.insert(loc=9,column='last_Winogrande',value=last_Winogrande) - latest_csv.insert(loc=10,column='diff_Winogrande(%)',value=diff_Winogrande) + latest_csv.insert(loc=6,column='last_Arc',value=last_Arc) + latest_csv.insert(loc=7,column='diff_Arc(%)',value=diff_Arc) + latest_csv.insert(loc=8,column='last_TruthfulQA',value=last_TruthfulQA) + latest_csv.insert(loc=9,column='diff_TruthfulQA(%)',value=diff_TruthfulQA) + latest_csv.insert(loc=10,column='last_Winogrande',value=last_Winogrande) + latest_csv.insert(loc=11,column='diff_Winogrande(%)',value=diff_Winogrande) diffs_within_normal_range = is_diffs_within_normal_range(diff_Arc, diff_TruthfulQA, diff_Winogrande, threshold=highlight_threshold)