diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml
new file mode 100644
index 0000000000000..80a8c522bc5a0
--- /dev/null
+++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml
@@ -0,0 +1,16 @@
+# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF
+# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1
+model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct"
+tasks:
+- name: "gsm8k_cot_llama"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.8317
+  - name: "exact_match,flexible-extract"
+    value: 0.8355
+limit: null
+num_fewshot: 8
+dtype: "bfloat16"
+fewshot_as_multiturn: true
+apply_chat_template: true
+fp8: true
\ No newline at end of file
diff --git a/.jenkins/lm-eval-harness/configs/models-fp8.txt b/.jenkins/lm-eval-harness/configs/models-fp8.txt
new file mode 100644
index 0000000000000..8a318a9ec936d
--- /dev/null
+++ b/.jenkins/lm-eval-harness/configs/models-fp8.txt
@@ -0,0 +1 @@
+Meta-Llama-3.1-8B-Instruct-fp8.yaml
\ No newline at end of file
diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
index da19c7a078629..3df0621f49a72 100644
--- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -27,6 +27,14 @@
 TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
 
 
+def setup_fp8(model_path, device_type):
+    flavor = f"g{device_type[-1]}"
+    normalized_model_name = Path(model_path).parts[-1].lower()
+    os.environ[
+        "QUANT_CONFIG"] = \
+        f"/software/data/vllm-benchmarks/inc/{normalized_model_name}/maxabs_quant_{flavor}.json"
+
+
 def fail_on_exit():
     os._exit(1)
 
@@ -42,10 +50,10 @@ def launch_lm_eval(eval_config):
         f"max_model_len=4096," \
         f"max_num_seqs={max_num_seqs}," \
         f"trust_remote_code={trust_remote_code}"
-    if eval_config.get("num_scheduler_steps"):
-        model_args += \
-            f",num_scheduler_steps={eval_config.get('num_scheduler_steps')}"
-    print(f"MODEL_ARGS: {model_args}")
+    if eval_config.get("fp8"):
+        model_args += ",quantization=inc," \
+            "kv_cache_dtype=fp8_inc," \
+            "weights_load_device=cpu"
     kwargs = {}
     if 'fewshot_as_multiturn' in eval_config:
         kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn']
@@ -138,6 +146,9 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):
                 f'tp{TP_SIZE}')
     record_xml_attribute("name", testname)
 
+    # Set up environment for FP8 inference
+    if eval_config.get("fp8"):
+        setup_fp8(eval_config["model_name"], platform)
     # Launch eval requests.
     start_time = time.perf_counter()
     results = launch_lm_eval(eval_config)
diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml
index f90cdb354d4f5..b32563d6222e9 100644
--- a/.jenkins/test_config.yaml
+++ b/.jenkins/test_config.yaml
@@ -22,3 +22,8 @@ stages:
       - name: gsm8k_large_g2_tp4
         flavor: g2.m
         command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4
+  - name: test_gsm8k_fp8
+    steps:
+      - name: gsm8k_small_g3_tp1_fp8
+        flavor: g3
+        command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-fp8.txt -t 1
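
For reference, the `setup_fp8` helper added above only derives a path and exports it through `QUANT_CONFIG`. Below is a minimal standalone sketch of that derivation: `quant_config_path` is a hypothetical name introduced here for illustration, and the `"g3"` argument is an assumption based on the flavor of the new Jenkins stage (the actual `platform` value passed by the test is not visible in this diff).

```python
from pathlib import Path


def quant_config_path(model_path: str, device_type: str) -> str:
    """Mirrors setup_fp8() above, but returns the path instead of
    exporting it via the QUANT_CONFIG environment variable."""
    flavor = f"g{device_type[-1]}"  # last char of the device type, e.g. "g3" -> "g3"
    normalized_model_name = Path(model_path).parts[-1].lower()
    return (f"/software/data/vllm-benchmarks/inc/"
            f"{normalized_model_name}/maxabs_quant_{flavor}.json")


# Worked example for the config added in this change:
print(quant_config_path(
    "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct", "g3"))
# -> /software/data/vllm-benchmarks/inc/meta-llama-3.1-8b-instruct/maxabs_quant_g3.json
```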
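And here is a sketch of how the new `fp8: true` knob flows through the test. The harness's YAML loading is outside this diff, so `yaml.safe_load` and the relative config path are assumptions; the appended engine arguments are taken verbatim from the `launch_lm_eval` change above.

```python
import yaml

# Hypothetical standalone walk-through of the fp8 gating added in this
# change; assumes the harness loads each config listed in models-fp8.txt
# with yaml.safe_load.
with open(".jenkins/lm-eval-harness/configs/"
          "Meta-Llama-3.1-8B-Instruct-fp8.yaml") as f:
    eval_config = yaml.safe_load(f)

if eval_config.get("fp8"):
    # Same arguments launch_lm_eval() appends above: quantize through
    # Intel Neural Compressor (INC), keep the KV cache in fp8, and load
    # weights on the CPU before moving them to the device.
    extra_model_args = (",quantization=inc"
                        ",kv_cache_dtype=fp8_inc"
                        ",weights_load_device=cpu")
    print(extra_model_args)
```

To reproduce the new CI stage locally, the stage's own command should suffice: `cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-fp8.txt -t 1`.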