FP8 example (#3062)

On top of #3049 ### Changes - Added FP8 example. ### Reason for changes - Examples coverage. ### Related tickets - 155923 ### Tests - ubuntu test_examples 627 - passed - windows test-examples 288 - passed - GA Test examples 135 - passed --------- Co-authored-by: Alexander Kozlov <[email protected]>
openvinotoolkit · Nov 27, 2024 · 2db9fb9 · 2db9fb9
1 parent f61aa89
commit 2db9fb9
Show file tree

Hide file tree

Showing 6 changed files with 182 additions and 1 deletion.
diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/README.md b/examples/llm_compression/openvino/smollm2_360m_fp8/README.md
@@ -0,0 +1,26 @@
+# Large Language Models FP8 Compression Example
+
+This example demonstrates how to apply static FP8 quantization to the [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct) model. It can be useful for evaluation and early HW enablement purposes.
+
+## Prerequisites
+
+To use this example:
+
+- Create a separate Python* environment and activate it: `python3 -m venv nncf_env && source nncf_env/bin/activate`
+- Install dependencies:
+
+```bash
+pip install -U pip
+pip install -r requirements.txt
+pip install ../../../../
+```
+
+## Run Example
+
+To run the example:
+
+```bash
+python main.py
+```
+
+It will automatically download the dataset and baseline model and save the resulting model.
diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/main.py b/examples/llm_compression/openvino/smollm2_360m_fp8/main.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+
+import datasets
+import numpy as np
+import openvino as ov
+from optimum.intel.openvino import OVModelForCausalLM
+from transformers import AutoTokenizer
+
+import nncf
+
+
+def transform_fn(data, model, tokenizer):
+    tokenized_text = tokenizer(data["text"], return_tensors="np")
+    input_ids = tokenized_text["input_ids"]
+    attention_mask = tokenized_text["attention_mask"]
+
+    inputs = {}
+    inputs["input_ids"] = input_ids
+    inputs["attention_mask"] = tokenized_text["attention_mask"]
+    position_ids = np.cumsum(attention_mask, axis=1) - 1
+    position_ids[attention_mask == 0] = 1
+
+    # The magic forms KV cache as model inputs
+    batch_size = input_ids.shape[0]
+    for input_name in model.key_value_input_names:
+        model_inputs = model.model.input(input_name)
+        shape = model_inputs.get_partial_shape()
+        shape[0] = batch_size
+        if shape[2].is_dynamic:
+            shape[2] = 0
+        else:
+            shape[1] = 0
+        inputs[input_name] = ov.Tensor(model_inputs.get_element_type(), shape.get_shape())
+
+    inputs["position_ids"] = position_ids
+    return inputs
+
+
+def generate_answers(questions, model, tokenizer, max_new_tokens=50):
+    messages = [
+        {"role": "system", "content": "You are a chatbot who always responds as short as possible."},
+        {"role": "user", "content": "What is the capital of Spain?"},
+        {"role": "assistant", "content": "Madrid."},
+    ]
+    answers_by_questions = {}
+    model.request = None
+
+    for question in questions:
+        messages.append({"role": "user", "content": question})
+        input_ids = tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+        ).to(device=model.device)
+        input_len = len(input_ids[0])
+
+        output = model.generate(input_ids, max_new_tokens=max_new_tokens, do_sample=False)[0]
+        answer = tokenizer.decode(output[input_len:], skip_special_tokens=True)
+        answers_by_questions[question] = answer
+        messages.append({"role": "assistant", "content": answer})
+
+    model.request = None
+    return answers_by_questions
+
+
+def main():
+    """
+    Quantize SmolLM2-360M-Instruct to FP8 (E4M3) with NNCF and compare model answers.
+
+    Steps: load the wikitext-2 test split as calibration data, export the model via
+    `OVModelForCausalLM`, print the answers of the non-optimized model, quantize
+    `model.model` with `nncf.quantize`, save model and tokenizer to `OUTPUT_DIR`,
+    reload the saved model and print its answers.
+
+    :return: Dictionary mapping each question to the answer of the optimized model.
+    """
+    MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
+    OUTPUT_DIR = "smollm2_360m_compressed"
+
+    dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+    # Filtering to remove empty samples from the dataset
+    dataset = dataset.filter(lambda example: len(example["text"]) > 1)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    # Export the model without 8-bit loading, uncompiled and stateless;
+    # transform_fn feeds the KV cache through explicit model inputs.
+    model = OVModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        export=True,
+        load_in_8bit=False,
+        compile=False,
+        stateful=False,
+        ov_config={"INFERENCE_PRECISION_HINT": "f32"},
+    )
+
+    questions = [
+        "What is the capital of France?",
+        "What is the highest mountain in the Alps?",
+        "What is the largest city in Canada?",
+        "What is the most visited city in Japan?",
+    ]
+
+    # Baseline answers, printed for comparison with the quantized model below.
+    answers_by_questions = generate_answers(questions, model, tokenizer)
+    print(f"Non-optimized model outputs:\n{answers_by_questions}\n")
+
+    # Each dataset item is converted to model inputs by transform_fn.
+    quantization_dataset = nncf.Dataset(dataset, partial(transform_fn, model=model, tokenizer=tokenizer))
+
+    model.model = nncf.quantize(
+        model.model,
+        calibration_dataset=quantization_dataset,
+        # Only the PERFORMANCE preset is supported in combination with the FP8 quantization mode
+        preset=nncf.QuantizationPreset.PERFORMANCE,
+        mode=nncf.QuantizationMode.FP8_E4M3,
+        model_type=nncf.ModelType.TRANSFORMER,
+        # SmoothQuant algorithm is not needed for FP8 quantization
+        advanced_parameters=nncf.AdvancedQuantizationParameters(
+            smooth_quant_alphas=nncf.AdvancedSmoothQuantParameters(matmul=-1)
+        ),
+    )
+    model.save_pretrained(OUTPUT_DIR)
+    tokenizer.save_pretrained(OUTPUT_DIR)
+
+    # Reload the quantized model from disk before generating the final answers.
+    model = OVModelForCausalLM.from_pretrained(
+        OUTPUT_DIR, ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0", "INFERENCE_PRECISION_HINT": "f32"}
+    )
+    answers_by_questions = generate_answers(questions, model, tokenizer)
+    print(f"Optimized model outputs:\n{answers_by_questions}\n")
+    return answers_by_questions
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt b/examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt
@@ -0,0 +1,5 @@
+datasets
+openvino==2024.5
+optimum-intel[openvino]
+transformers
+onnx<1.16.2
diff --git a/tests/cross_fw/examples/.test_durations b/tests/cross_fw/examples/.test_durations
@@ -13,5 +13,6 @@
     "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_ssd300_vgg16]": 231.613,
     "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_anomalib]": 478.797,
     "tests/cross_fw/examples/test_examples.py::test_examples[quantization_aware_training_torch_resnet18]": 1251.144,
-    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243
+    "tests/cross_fw/examples/test_examples.py::test_examples[post_training_quantization_torch_fx_resnet18]": 412.243,
+    "tests/cross_fw/examples/test_examples.py::test_examples[fp8_llm_quantization]": 229.69
 }
diff --git a/tests/cross_fw/examples/example_scope.json b/tests/cross_fw/examples/example_scope.json
@@ -260,5 +260,18 @@
             "int8_model_size": 5.677968978881836,
             "model_compression_rate": 3.7654144877995197
         }
+    },
+    "fp8_llm_quantization": {
+        "backend": "openvino",
+        "requirements": "examples/llm_compression/openvino/smollm2_360m_fp8/requirements.txt",
+        "cpu": "Intel(R) Core(TM) i9-10980XE CPU @ 3.00GHz",
+        "accuracy_metrics": {
+            "answers": [
+                "Paris.",
+                "Mont Blanc.",
+                "Toronto.",
+                "Tokyo."
+            ]
+        }
     }
 }
diff --git a/tests/cross_fw/examples/run_example.py b/tests/cross_fw/examples/run_example.py
@@ -184,6 +184,14 @@ def llm_compression_synthetic() -> Dict[str, float]:
     return {"word_count": len(result.split())}
 
 
+def fp8_llm_quantization() -> Dict[str, float]:
+    from examples.llm_compression.openvino.smollm2_360m_fp8.main import main as fp8_llm_quantization_main
+
+    result = fp8_llm_quantization_main()
+
+    return {"answers": list(result.values())}
+
+
 def post_training_quantization_torch_fx_resnet18():
     from examples.post_training_quantization.torch_fx.resnet18.main import main as resnet18_main