Skip to content

Commit

Permalink
Add fp8 test to jenkins CI (#429)
Browse files Browse the repository at this point in the history
  • Loading branch information
afierka-intel committed Oct 30, 2024
1 parent 2df2f1f commit 049d9dc
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF
# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1
model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct"
tasks:
- name: "gsm8k_cot_llama"
metrics:
- name: "exact_match,strict-match"
value: 0.8317
- name: "exact_match,flexible-extract"
value: 0.8355
limit: null
num_fewshot: 8
dtype: "bfloat16"
fewshot_as_multiturn: true
apply_chat_template: true
fp8: true
1 change: 1 addition & 0 deletions .jenkins/lm-eval-harness/configs/models-fp8.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Meta-Llama-3.1-8B-Instruct-fp8.yaml
19 changes: 15 additions & 4 deletions .jenkins/lm-eval-harness/test_lm_eval_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def setup_fp8(model_path, device_type):
    """Configure the environment for FP8 (INC) quantized inference.

    Derives the Gaudi hardware flavor from the last character of
    *device_type* (e.g. "g2"/"g3") and points QUANT_CONFIG at the
    pre-generated maxabs quantization JSON for this model.
    """
    hw_flavor = f"g{device_type[-1]}"
    model_dir_name = Path(model_path).parts[-1].lower()
    quant_config_path = \
        f"/software/data/vllm-benchmarks/inc/{model_dir_name}/maxabs_quant_{hw_flavor}.json"
    os.environ["QUANT_CONFIG"] = quant_config_path


def fail_on_exit():
    """Terminate the process immediately with exit status 1.

    Uses os._exit (not sys.exit) so no cleanup handlers, finally
    blocks, or atexit hooks run — presumably to force the CI job to
    register a hard failure; verify against caller.
    """
    os._exit(1)

Expand All @@ -42,10 +50,10 @@ def launch_lm_eval(eval_config):
f"max_model_len=4096," \
f"max_num_seqs={max_num_seqs}," \
f"trust_remote_code={trust_remote_code}"
if eval_config.get("num_scheduler_steps"):
model_args += \
f",num_scheduler_steps={eval_config.get('num_scheduler_steps')}"
print(f"MODEL_ARGS: {model_args}")
if eval_config.get("fp8"):
model_args += ",quantization=inc," \
"kv_cache_dtype=fp8_inc," \
"weights_load_device=cpu"
kwargs = {}
if 'fewshot_as_multiturn' in eval_config:
kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn']
Expand Down Expand Up @@ -138,6 +146,9 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):
f'tp{TP_SIZE}')
record_xml_attribute("name", testname)

# Set up environment for FP8 inference
if eval_config.get("fp8"):
setup_fp8(eval_config["model_name"], platform)
# Launch eval requests.
start_time = time.perf_counter()
results = launch_lm_eval(eval_config)
Expand Down
5 changes: 5 additions & 0 deletions .jenkins/test_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,8 @@ stages:
- name: gsm8k_large_g2_tp4
flavor: g2.m
command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4
- name: test_gsm8k_fp8
steps:
- name: gsm8k_small_g3_tp1_fp8
flavor: g3
command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-fp8.txt -t 1

0 comments on commit 049d9dc

Please sign in to comment.