From 9a18ae0119ee5e6669b42f49a6fe7ac3397ac55b Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Tue, 27 Aug 2024 09:19:47 -0700 Subject: [PATCH 1/9] set cpu affinity and membind for better oob performance (#853) * set num threads and memory binding for better OOB performance * clean env var * added core and memory binding util for improved performance * add example usage in docstring * change utlity for best oob to support world_size and rank >=1 * fix style * fix node_id value to account for rank_id starts at zero * numa node assignment calculated from local size not from world size * reorg imports, moved checks to import_utils, remove prints for logger * raise Errors with missing pkg and unsupported OS * added missng env var to list * Update optimum/intel/utils/modeling_utils.py * Update optimum/intel/utils/import_utils.py * Update optimum/intel/utils/import_utils.py * fix style quality error --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docker/Dockerfile.intel | 9 +-- optimum/intel/utils/__init__.py | 1 + optimum/intel/utils/import_utils.py | 12 ++++ optimum/intel/utils/modeling_utils.py | 82 +++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.intel b/docker/Dockerfile.intel index 60fd51b424..a7f1dc978f 100644 --- a/docker/Dockerfile.intel +++ b/docker/Dockerfile.intel @@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ libpng-dev \ python3 \ python3-pip \ + python3-dev \ + libnuma-dev \ && rm -rf /var/lib/apt/lists/*" RUN /usr/sbin/update-ccache-symlinks RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache @@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \ torchaudio==${TORCHAUDIO_VERSION} \ -f https://download.pytorch.org/whl/torch_stable.html && \ python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ - python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python3 -m pip install --no-cache-dir numa -ARG OMP_NUM_THREADS=1 -ENV OMP_NUM_THREADS=${OMP_NUM_THREADS} ARG KMP_BLOCKTIME=1 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} ARG KMP_HW_SUBSET=1T ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} -ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so" \ No newline at end of file +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/optimum/intel/utils/__init__.py b/optimum/intel/utils/__init__.py index d77588f896..50cdfa143e 100644 --- a/optimum/intel/utils/__init__.py +++ b/optimum/intel/utils/__init__.py @@ -22,6 +22,7 @@ is_neural_compressor_available, is_neural_compressor_version, is_nncf_available, + is_numa_available, is_openvino_available, is_torch_version, is_transformers_available, diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 6be0aac47a..032280e940 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -150,6 +150,14 @@ except importlib_metadata.PackageNotFoundError: _accelerate_available = False +_numa_available = importlib.util.find_spec("numa") is not None + +if _numa_available: + try: + importlib_metadata.version("numa") + except importlib_metadata.PackageNotFoundError: + _numa_available = False + def is_transformers_available(): return _transformers_available @@ -272,6 +280,10 @@ def 
is_accelerate_available(): return _accelerate_available +def is_numa_available(): + return _numa_available + + # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cd5b34f86f..1d2f7b03c5 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -12,16 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +import math +import os +import platform import re from pathlib import Path from typing import List, Optional, Union +import psutil import torch from huggingface_hub import HfApi, HfFolder +from .import_utils import is_numa_available + MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"} +logger = logging.getLogger(__name__) + def get_model_device(model: torch.nn.Module) -> torch.device: """ @@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model): setattr(model, child_name, new_m) else: replace_customized_linear_with_linear(child) + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def bind_cores_for_best_perf(): + """ + Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance. + Works for wold_size >= 1 and rank >= 1 + + Example: + .. code-block:: python + + from optimum.intel.ipex import IPEXModelForCausalLM + from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf + + bind_cores_for_best_perf() + model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True) + tokenizer = AutoTokenizer.from_pretrained("gpt2") + input_sentence = ["tell me a story about a trip to the moon"] + model_inputs = tokenizer(input_sentence, return_tensors="pt") + generation_kwargs = dict(max_new_tokens=500) + generated_ids = model.generate(**model_inputs, **generation_kwargs) + + Returns: + None + + """ + if platform.system() != "Linux": + logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + if not is_numa_available(): + logger.error("'numa' module not found") + raise ImportError("'numa' module not found, install with 'pip install numa'") + import numa + + local_size = get_int_from_env( + ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1 + ) + rank_id = get_int_from_env( + ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0 + ) + nodes = numa.get_max_node() + 1 + rank_per_node = math.ceil(local_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance") + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + 
logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}") + if len(numa.get_membind()) == nodes: + # if numa memory binding is not set, set it to the node where the rank is running + numa.set_membind([node_id]) + + torch.set_num_threads(num_cpus_per_rank) + + if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True): + # if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.set_affinity( + 0, + list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank], + ) + logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}") From af8c28d46e2e3b589170866502093c5af34b749c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 15:43:45 +0400 Subject: [PATCH 2/9] Fix openvino nightly install in tests (#885) * fix openvino nightly install in tests * Update .github/workflows/test_openvino.yml --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_openvino.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 13a6b83e57..2262407898 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -51,7 +51,6 @@ jobs: pytest tests/openvino/test_modeling_basic.py - name: Test openvino-nightly run: | - pip uninstall -y openvino - pip install openvino-nightly + pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov From d6e6e1f0350ef0b66dab5266196d56f3a5dd4c7c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 17:06:07 +0400 Subject: [PATCH 3/9] Fix attention mask for glm4 (#884) --- optimum/exporters/openvino/model_patcher.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6e65f4f11a..8cb745bd72 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -308,10 +308,9 @@ def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer, def _glm4_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, key_layer, value_layer, attention_mask.to(torch.float32) - ) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + causal_mask.masked_fill_(attention_mask, float("-inf")) + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, causal_mask) context_layer = context_layer.transpose(1, 2).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) context_layer = context_layer.reshape(*new_context_layer_shape) From b5998f2f44e581b102ed7a9b714ac0f7c2d51a66 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 30 Aug 2024 16:11:11 +0200 Subject: [PATCH 4/9] Apply weight compression after model save to reduce peak RAM during export (#878) * Initial commit * Style * Adopt tests * 
Add no-nncf warning * Apply suggested changes * Do not save in fp16 in case of weight compression * Replace model files right away --- optimum/exporters/openvino/__main__.py | 50 ++++++++- optimum/exporters/openvino/convert.py | 38 +------ tests/openvino/test_quantization.py | 138 +++++++++++++------------ 3 files changed, 123 insertions(+), 103 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 77f8049606..c4b6ef0cd8 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -14,7 +14,9 @@ import gc import logging +import operator import warnings +from functools import reduce from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union @@ -23,18 +25,20 @@ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase from transformers.utils import is_torch_available +from openvino.runtime import Core, Type, save_model from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import ( + is_nncf_available, is_openvino_tokenizers_available, is_openvino_version, is_transformers_version, ) from optimum.utils.save_utils import maybe_load_preprocessors -from .utils import clear_class_registry +from .utils import _MAX_UNCOMPRESSED_SIZE, clear_class_registry if TYPE_CHECKING: @@ -402,7 +406,7 @@ class StoreAttr(object): model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - export_from_model( + submodel_paths = export_from_model( model=model, output=output, task=task, @@ -425,6 +429,48 @@ class StoreAttr(object): del model gc.collect() + core = Core() + for submodel_path in submodel_paths: + submodel_path = Path(output) / submodel_path + submodel = core.read_model(submodel_path) + + quantization_config = None + if ov_config is None: + num_parameters = 0 + for op in submodel.get_ops(): + if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]: + num_parameters += reduce(operator.mul, op.shape, 1) + if num_parameters >= _MAX_UNCOMPRESSED_SIZE: + if is_nncf_available(): + quantization_config = {"bits": 8, "sym": False} + logger.info("The model weights will be quantized to int8_asym.") + else: + logger.warning( + "The model will be converted with no weights quantization. Quantization of the weights to int8 " + "requires nncf. 
Please install it with `pip install nncf`" + ) + break + else: + quantization_config = ov_config.quantization_config + if quantization_config is None: + continue + + if not is_nncf_available(): + raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`") + + from optimum.intel.openvino.quantization import _weight_only_quantization + + _weight_only_quantization(submodel, quantization_config) + + compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" + save_model(submodel, compressed_submodel_path, compress_to_fp16=False) + del submodel + + submodel_path.unlink() + submodel_path.with_suffix(".bin").unlink() + compressed_submodel_path.rename(submodel_path) + compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin")) + # Unpatch modules after GPTQ export if do_gptq_patching: torch.cuda.is_available = orig_cuda_check diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 0b937734ce..dc2af68784 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -49,7 +49,6 @@ from .model_patcher import patch_model_with_bettertransformer from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful from .utils import ( - _MAX_UNCOMPRESSED_SIZE, OV_XML_FILE_NAME, clear_class_registry, flattenize_inputs, @@ -76,21 +75,7 @@ def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None): - compress_to_fp16 = False - - if ov_config is not None: - if ov_config.quantization_config: - if not is_nncf_available(): - raise ImportError( - "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`" - ) - - from optimum.intel.openvino.quantization import _weight_only_quantization - - _weight_only_quantization(model, ov_config.quantization_config) - - compress_to_fp16 = ov_config.dtype == "fp16" - + compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16" model = _add_version_info_to_model(model, library_name) save_model(model, path, compress_to_fp16) @@ -643,25 +628,6 @@ def export_from_model( ) logging.disable(logging.NOTSET) - if ov_config is None: - if library_name == "diffusers": - num_parameters = model.unet.num_parameters() - else: - num_parameters = sum(param.numel() for param in list(model.parameters()) if param.requires_grad) - - if num_parameters >= _MAX_UNCOMPRESSED_SIZE: - if is_nncf_available(): - from ...intel.openvino.configuration import OVConfig - - ov_config = OVConfig(quantization_config={"bits": 8, "sym": False}) - - logger.info("The model weights will be quantized to int8_asym.") - else: - logger.warning( - "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf." - "please install it with `pip install nncf`" - ) - if library_name != "diffusers": # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) @@ -720,6 +686,8 @@ def export_from_model( patch_16bit_model=patch_16bit_model, ) + return files_subpaths + def export_tokenizer( tokenizer, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 23ff3a03ca..5835bc76a2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import inspect # ruff: noqa @@ -22,6 +23,7 @@ from enum import Enum from functools import partial from typing import Union + import pytest import evaluate import numpy as np @@ -538,76 +540,80 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): self.assertEqual(0, num_int8) def test_ovmodel_load_large_model_with_default_compressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(quantization_config={"bits": 8}), - library_name="transformers", - ) + def main_export_in_stacktrace(*args, **kwargs): + # Compression was called from `main_export` + self.assertTrue(inspect.stack()[5].function == "main_export") + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, compile=False, use_cache=False + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT8_ASYM, + "ratio": 1.0, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with( + unittest.mock.ANY, + **compression_params, + ) def test_ovmodel_load_large_model_with_uncompressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False + ) + compress_weights_patch.assert_not_called() def test_ovmodel_load_large_model_with_additional_quantization_config(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = 
unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], - export=True, - compile=False, - use_cache=False, - quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), - ) - # quantization will be performed later, using load_model - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) - compression_params = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "ratio": 0.8, - "group_size": -1, - "all_layers": None, - "sensitivity_metric": None, - "dataset": None, - "ignored_scope": nncf.IgnoredScope(), - "awq": None, - "subset_size": 128, - "scale_estimation": None, - } - compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) + def main_export_not_in_stacktrace(*args, **kwargs): + # Compression was not called from `main_export` + self.assertTrue(all(frame_info.function != "main_export" for frame_info in inspect.stack())) + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_not_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], + export=True, + compile=False, + use_cache=False, + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "ratio": 0.8, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4): From 9a8782446e394ac07283b8bd8b44916c4f297826 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 2 Sep 2024 18:23:53 +0200 Subject: [PATCH 5/9] Introduce support for `mxfp4` data type for OV weight compression (#882) * Add support for mxfp4_e2m1 data type for OV weight compression * Add new tests for mxfp4_e2m1. Adopt tests structure for new data type. 
* Style * Rename dtype to weight_format * Fix descriptions * Replace 'mxfp4_e2m1' with 'mxfp4' * Add checks for mxfp4 weight format * Dataset is possible in case of INT8 HQ * Address comments --- optimum/commands/export/openvino.py | 19 ++--- optimum/intel/openvino/configuration.py | 41 ++++++++- optimum/intel/openvino/quantization.py | 9 +- tests/openvino/test_exporters_cli.py | 37 +++----- tests/openvino/test_quantization.py | 109 +++++++++++++----------- tests/openvino/utils_tests.py | 23 +++-- 6 files changed, 138 insertions(+), 100 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 742612ca35..5f6c209df6 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -70,9 +70,9 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--weight-format", type=str, - choices=["fp32", "fp16", "int8", "int4", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"], + choices=["fp32", "fp16", "int8", "int4", "mxfp4"], default=None, - help="he weight format of the exported model.", + help="The weight format of the exported model.", ) optional_group.add_argument( "--library", @@ -255,12 +255,11 @@ def run(self): elif self.args.weight_format in {"fp16", "fp32"}: ov_config = OVConfig(dtype=self.args.weight_format) else: - is_int8 = self.args.weight_format == "int8" - - # For int4 quantization if no parameter is provided, then use the default config if exist - if no_compression_parameter_provided(self.args) and not is_int8: + # For int4 quantization if no parameter is provided, then use the default config if exists + if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4": quantization_config = get_default_int4_config(self.args.model) else: + is_int8 = self.args.weight_format == "int8" quantization_config = { "bits": 8 if is_int8 else 4, "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), @@ -272,17 +271,11 @@ def run(self): "quant_method": "awq" if self.args.awq else "default", "sensitivity_metric": self.args.sensitivity_metric, "scale_estimation": self.args.scale_estimation, + "weight_format": self.args.weight_format, } if quantization_config.get("dataset", None) is not None: quantization_config["trust_remote_code"] = self.args.trust_remote_code - - if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: - logger.warning( - f"--weight-format {self.args.weight_format} is deprecated, possible choices are fp32, fp16, int8, int4" - ) - quantization_config["sym"] = "asym" not in self.args.weight_format - quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 ov_config = OVConfig(quantization_config=quantization_config) quantization_config = ov_config.quantization_config if ov_config else None diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index aaaca031b2..ed9638e18c 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -312,6 +312,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): scale_estimation (`bool`, *optional*): Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and compressed layers. Providing a dataset is required to run scale estimation. + weight_format (`str`, defaults to 'int'): + Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4']. 
""" def __init__( @@ -329,6 +331,7 @@ def __init__( num_samples: Optional[int] = None, quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, scale_estimation: bool = None, + weight_format: Optional[str] = None, **kwargs, ): super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples) @@ -341,6 +344,7 @@ def __init__( self.sensitivity_metric = sensitivity_metric self.quant_method = OVQuantizationMethod(quant_method) if isinstance(quant_method, str) else quant_method self.scale_estimation = scale_estimation + self.weight_format = weight_format self.post_init() def post_init(self): @@ -382,10 +386,38 @@ def post_init(self): raise ValueError( f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + if self.all_layers: + raise ValueError("The `all_layers` parameter is not supported for 8-bit quantization") + if self.sensitivity_metric: + raise ValueError("The `sensitivity_metric` parameter is not supported for 8-bit quantization") + if self.quant_method == OVQuantizationMethod.AWQ: + raise ValueError( + "The AWQ algorithm is not supported for 8-bit quantization and got `quant_method='awq'`, please update accordingly" + ) + if self.scale_estimation: + raise ValueError( + "The Scale Estimation algorithm is not supported for 8-bit quantization and got `scale_estimation=True`, please set `scale_estimation=False`" + ) if self.tokenizer is not None and not isinstance(self.tokenizer, str): raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") + if self.weight_format is None: + self.weight_format = "int4" if self.bits == 4 else "int8" + if self.weight_format not in ["int4", "int8", "mxfp4"]: + raise ValueError( + f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4'], but found: {self.weight_format}." 
+ ) + if self.weight_format == "mxfp4": + if self.bits != 4: + raise ValueError( + f"When applying weight compression with 'mxfp4' weight format the `bits` parameters must be set to 4, but found {self.bits}" + ) + if self.quant_method == OVQuantizationMethod.AWQ: + raise ValueError("The AWQ algorithm is not supported for 'mxfp4' weight format") + if self.scale_estimation: + raise ValueError("The Scale Estimation algorithm is not supported for 'mxfp4' weight format") + @dataclass class OVDynamicQuantizationConfig(OVWeightQuantizationConfig): @@ -473,8 +505,13 @@ def __init__( self.compression = kwargs.get( "compression", None ) # A field for backward-compatability of training-time compression parameters - bits = self.quantization_config.bits if self.quantization_config else None - self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype + if self.quantization_config is not None: + if isinstance(self.quantization_config, OVWeightQuantizationConfig): + self.dtype = self.quantization_config.weight_format + else: + self.dtype = "int8" + else: + self.dtype = dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): self.input_info = [ diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index df9d496de7..c858c3a63a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -811,10 +811,13 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - if config.bits == 8: - mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM + if config.weight_format == "mxfp4": + mode = CompressWeightsMode.E2M1 else: - mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM + if config.bits == 8: + mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM + else: + mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM return nncf.compress_weights( model, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b5aff8d175..6380a52881 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -89,31 +89,22 @@ class OVCLIExportTestCase(unittest.TestCase): ) TEST_4BIT_CONFIGURATONS = [ - ("text-generation-with-past", "opt125m", "int4_sym_g128", 4, 72), - ("text-generation-with-past", "opt125m", "int4_asym_g128", 4, 144), - ("text-generation-with-past", "opt125m", "int4_sym_g64", 4, 72), - ("text-generation-with-past", "opt125m", "int4_asym_g64", 4, 144), - ( - "text-generation-with-past", - "llama_awq", - "int4 --ratio 1.0 --sym --group-size 8 --all-layers", - 0, - 16, - ), + ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), + ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}), + ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}), + ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 " "--sensitivity-metric max_activation_variance", - 4, - 14, + {"int8": 4, "int4": 14}, ), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 
--num-samples 100 ", - 4, - 14, + {"int8": 4, "int4": 14}, ), ] @@ -219,8 +210,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str): expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_int8[i], num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_int8[i], num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int): @@ -231,12 +222,12 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fq, num_int8, _ = get_num_quantized_nodes(model.unet) - self.assertEqual(exp_num_int8, num_int8) + num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet) + self.assertEqual(exp_num_int8, num_weight_nodes["int8"]) self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATONS) - def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): + def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict): with TemporaryDirectory() as tmpdir: result = subprocess.run( f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", @@ -251,9 +242,9 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")] ).from_pretrained(tmpdir, **model_kwargs) - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_int8, num_int8) - self.assertEqual(expected_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 5835bc76a2..c263000f18 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -124,9 +124,9 @@ def preprocess_function(examples, tokenizer): ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -165,9 +165,9 @@ def preprocess_function(examples, tokenizer): model = model_cls.from_pretrained(tmp_dir) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ 
-189,11 +189,12 @@ class OVWeightCompressionTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) LOAD_IN_4_BITS_SCOPE = ( + (OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8), {"int4": 30, "int8": 14}), ( OVModelForCausalLM, "gpt2", - dict(bits=4, sym=False, group_size=-1, ratio=0.8), - 14, + dict(bits=4, weight_format="mxfp4", group_size=32), + {"f4e2m1": 20, "f8e8m0": 20, "int8": 4}, ), ( OVModelForCausalLM, @@ -204,13 +205,13 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]}, ), - 4, + {"int4": 38, "int8": 4}, ), ( OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True), - 18, + {"int4": 26, "int8": 18}, ), ( OVModelForCausalLM, @@ -223,7 +224,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset="c4", ), - 14, + {"int4": 25, "int8": 14}, ), ( OVModelForCausalLM, @@ -236,7 +237,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset=["one two, " * i for i in range(10)], ), - 14, + {"int4": 25, "int8": 14}, ), ( OVModelForCausalLM, @@ -251,7 +252,7 @@ class OVWeightCompressionTest(unittest.TestCase): quant_method=QuantizationMethod.AWQ, scale_estimation=True, ), - 8, + {"int4": 12, "int8": 8}, ), ( OVModelForCausalLM, @@ -265,7 +266,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", quant_method="awq", ), - 8, + {"int4": 12, "int8": 8}, ), ) @@ -308,8 +309,8 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_pt_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_pt_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -339,8 +340,8 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -365,9 +366,9 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_int8, num_int8) - self.assertEqual(expected_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) + self.assertEqual(expected_int4, num_weight_nodes["int4"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -391,8 +392,8 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, 
num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -418,8 +419,8 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8[i], num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): @@ -428,10 +429,10 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f with tempfile.TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) - self.assertEqual(expected_ov_int8, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) model.save_pretrained(tmp_dir) @@ -442,10 +443,10 @@ def test_stable_diffusion_with_weight_compression(self): quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(int8_pipe.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet) self.assertEqual(0, num_fake_quantize) - self.assertEqual(242, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(242, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( @@ -461,10 +462,10 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) - self.assertEqual(expected_ov_int8, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS) @unittest.mock.patch.dict( @@ -478,9 +479,9 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int4, num_weight_nodes["int4"]) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) @@ -492,7 +493,7 @@ 
def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( - self, model_cls, model_name, quantization_config, expected_ov_int4 + self, model_cls, model_name, quantization_config, expected_num_weight_nodes ): model_id = MODEL_NAMES[model_name] with tempfile.TemporaryDirectory() as tmp_dir: @@ -506,13 +507,14 @@ def test_ovmodel_4bit_auto_compression_with_config( if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int4, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - self.assertEqual(openvino_config.dtype, "int4") + self.assertEqual(openvino_config.dtype, quantization_config.weight_format) @parameterized.expand(((OVModelForCausalLM, "gpt2"),)) def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type): @@ -521,8 +523,8 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty self.assertTrue(model.use_cache) expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0] - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): @@ -536,8 +538,8 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): models = [model] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(0, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(0, num_weight_nodes["int8"]) def test_ovmodel_load_large_model_with_default_compressed_weights(self): def main_export_in_stacktrace(*args, **kwargs): @@ -616,7 +618,9 @@ def main_export_not_in_stacktrace(*args, **kwargs): compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) - def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4): + def test_ovmodel_4bit_dynamic_with_config( + self, model_cls, model_name, quantization_config, expected_num_weight_nodes + ): model_id = MODEL_NAMES[model_name] with tempfile.TemporaryDirectory() as tmp_dir: group_size = quantization_config.pop("group_size", 32) @@ -631,13 +635,14 @@ def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantizat if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int4, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - 
self.assertEqual(openvino_config.dtype, "int4") + self.assertEqual(openvino_config.dtype, quantization_config.weight_format) class OVQuantizerQATest(unittest.TestCase): @@ -764,9 +769,9 @@ def compute_metrics(p): trainer.save_model() model = OVModelForSequenceClassification.from_pretrained(tmp_dir) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -793,7 +798,7 @@ class OVQuantizationConfigTest(unittest.TestCase): quant_method=OVQuantizationMethod.DEFAULT, ), ), - (OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), + (OVWeightQuantizationConfig(bits=4, dataset=["hello world", "i'm alive"]),), ( OVQuantizationConfig( ignored_scope={"names": ["op_name"]}, @@ -836,7 +841,7 @@ class OVQuantizationConfigTest(unittest.TestCase): (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), ( - dict(bits=8, fast_bias_correction=True, dataset="wikitext2"), + dict(bits=4, fast_bias_correction=True, dataset="wikitext2"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config", ), @@ -858,7 +863,7 @@ class OVQuantizationConfigTest(unittest.TestCase): (dict(abc="def", weight_only=False), OVQuantizationConfig, None), (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None), ( - dict(bits=8, fast_bias_correction=True, dataset="wikitext2", weight_only=True), + dict(bits=4, fast_bias_correction=True, dataset="wikitext2", weight_only=True), OVWeightQuantizationConfig, None, ), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f13723eef5..86a0a51e80 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -167,14 +167,23 @@ def get_num_quantized_nodes(ov_model): num_fake_quantize = 0 - num_int8 = 0 - num_int4 = 0 + num_weight_nodes = { + "int8": 0, + "int4": 0, + "f4e2m1": 0, + "f8e8m0": 0, + } for elem in ov_model.model.get_ops(): if "FakeQuantize" in elem.name: num_fake_quantize += 1 for i in range(elem.get_output_size()): - if elem.get_output_element_type(i).get_type_name() in ["i8", "u8"]: - num_int8 += 1 - if elem.get_output_element_type(i).get_type_name() in ["i4", "u4"]: - num_int4 += 1 - return num_fake_quantize, num_int8, num_int4 + type_name = elem.get_output_element_type(i).get_type_name() + if type_name in ["i8", "u8"]: + num_weight_nodes["int8"] += 1 + if type_name in ["i4", "u4"]: + num_weight_nodes["int4"] += 1 + if type_name == "f4e2m1": + num_weight_nodes["f4e2m1"] += 1 + if type_name == "f8e8m0": + num_weight_nodes["f8e8m0"] += 1 + return num_fake_quantize, num_weight_nodes From d4e3128300eecba2bbded4a2fa2fa7a1bbbc78e6 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:33:16 +0800 Subject: [PATCH 6/9] Deprecate ipex inference mode (#837) * rm ipex inference * rm inference mode --- optimum/intel/__init__.py | 2 - optimum/intel/ipex/__init__.py | 2 - optimum/intel/ipex/inference.py | 158 ---------------------- optimum/intel/utils/dummy_ipex_objects.py | 7 - tests/ipex/test_inference.py | 117 ---------------- 5 files changed, 286 deletions(-) 
delete mode 100644 optimum/intel/ipex/inference.py delete mode 100644 tests/ipex/test_inference.py diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 2f2d1cb669..7f76c28543 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -44,7 +44,6 @@ ] else: _import_structure["ipex"] = [ - "inference_mode", "IPEXModelForCausalLM", "IPEXModelForSequenceClassification", "IPEXModelForMaskedLM", @@ -196,7 +195,6 @@ IPEXModelForQuestionAnswering, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, - inference_mode, ) try: diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index 83943176b2..c1f711acfc 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -22,5 +22,3 @@ IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) - -from .inference import inference_mode diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py deleted file mode 100644 index a628ebe12e..0000000000 --- a/optimum/intel/ipex/inference.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ruff: noqa - -import logging -from typing import Union - -import torch -from torch import nn -from transformers import add_start_docstrings -from transformers.pipelines import Pipeline -from transformers.utils import is_ipex_available - -from ...exporters.tasks import TasksManager -from ..generation.modeling import jit_trace -from .modeling_base import ( - IPEXModel, - IPEXModelForCausalLM, - IPEXModelForMaskedLM, - IPEXModelForSequenceClassification, - IPEXModelForTokenClassification, - IPEXModelForQuestionAnswering, -) - - -from .utils import _HEAD_TO_AUTOMODELS - - -logger = logging.getLogger(__name__) - -IPEX_NOT_AVAILABLE_ERROR_MSG = ( - "Intel PyTorch Extensions was not found." - "please make sure you've installed the package or run " - "pip install intel_extension_for_pytorch" -) - -if is_ipex_available(): - import intel_extension_for_pytorch as ipex - - -class _ModelFallbackWrapper: - __slots__ = ("_optimized", "_default") - - def __init__(self, optimized, default): - self._optimized = optimized - self._default = default - - def __call__(self, *args, **kwargs): - try: - return self._optimized(*args, **kwargs) - except Exception: - return self._default(*args, **kwargs) - - def __getattr__(self, item): - if not item.startswith("__"): - return getattr(self._default, item) - else: - return self.item - - -@add_start_docstrings( - """ - inference_mode is an Intel specific context-manager analogous to PyTorch's inference_mode to use for inference - workload on Intel CPUs, especially Intel Xeon Scalable CPUs. 
- """, -) -class inference_mode: - __slots__ = ("_model", "_dtype", "_graph_mode", "_verbose", "_original", "_jit") - - def __init__( - self, - model: Union[nn.Module, Pipeline], - dtype: torch.dtype = torch.float32, - **kwargs, - ): - """ - Args: - model (`torch.nn.Module` or `transformers.Pipeline`): - The model or pipeline instance to optimize. - dtype (`torch.dtype = torch.float32`), *optional*): - The data type used to do the computation. - Acceptable type are `torch.float32` (default) and `torch.bfloat16`. - Please note `torch.bfloat16` requires `avx512_bf16` instructions set as present on - 4th Generation of Intel Xeon Scalable CPUs (Sapphire Rapids). - jit (`boolean = False`, *optional*): - Enable jit to accelerate inference speed - """ - logger.warning( - "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead." - ) - - if not is_ipex_available(): - raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) - - self._model = model - self._dtype = dtype - self._graph_mode = False # Let's keep for future use when it doesn't hang anymore - self._original = None - - if "jit" in kwargs: - logger.warning( - "`jit` is deprecated and will be removed in a future version. Use `IPEXModel` to load and export your model to TorchScript instead." - ) - self._jit = kwargs.pop("jit", False) - - def __enter__(self): - if self._model.framework == "pt": - with torch.inference_mode(): - try: - ipex.enable_onednn_fusion(True) - - self._original = self._model.model if isinstance(self._model, Pipeline) else self._model - model = ipex.optimize( - self._original, - dtype=self._dtype, - graph_mode=self._graph_mode, - level="O1", - auto_kernel_selection=True, - ) - if self._jit: - use_cache = getattr(self._original.config, "use_cache", False) - task = ( - self._model.task - if isinstance(self._model, Pipeline) - else TasksManager._infer_task_from_model_or_model_class(model) - ) - if task in _HEAD_TO_AUTOMODELS: - model = jit_trace(model, task, use_cache) - auto_model_class = eval(_HEAD_TO_AUTOMODELS[task]) - model = auto_model_class(model, self._original.config, use_cache=use_cache) - - # Enable automatic mixed precision (AMP) if we are going to target `bfloat16` - with torch.cpu.amp.autocast(enabled=self._dtype == torch.bfloat16): - if isinstance(self._model, Pipeline): - # Patching model with the new one - self._model.model = _ModelFallbackWrapper(model, self._original) - return self._model - return model - - except RuntimeError: - return self._model - else: - return self._model - - def __exit__(self, exc_type, exc_val, exc_tb): - self._model = self._original diff --git a/optimum/intel/utils/dummy_ipex_objects.py b/optimum/intel/utils/dummy_ipex_objects.py index c451dd3956..4bd7eee630 100644 --- a/optimum/intel/utils/dummy_ipex_objects.py +++ b/optimum/intel/utils/dummy_ipex_objects.py @@ -15,13 +15,6 @@ from .import_utils import DummyObject, requires_backends -class inference_mode(metaclass=DummyObject): - _backends = ["ipex"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["ipex"]) - - class IPEXModel(metaclass=DummyObject): _backends = ["ipex"] diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py deleted file mode 100644 index 1a452fe408..0000000000 --- a/tests/ipex/test_inference.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import torch -from parameterized import parameterized -from transformers import ( - AutoModelForCausalLM, - AutoModelForQuestionAnswering, - AutoModelForSequenceClassification, - AutoModelForTokenClassification, - AutoTokenizer, - pipeline, -) -from utils_tests import MODEL_NAMES - -from optimum.intel import inference_mode as ipex_inference_mode -from optimum.intel.ipex.modeling_base import IPEXModel - - -_CLASSIFICATION_TASK_TO_AUTOMODELS = { - "text-classification": AutoModelForSequenceClassification, - "token-classification": AutoModelForTokenClassification, -} - - -class IPEXClassificationTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bert", - "distilbert", - "roberta", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): - model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline(task, model=model, tokenizer=tokenizer) - - with torch.inference_mode(): - outputs = pipe(inputs) - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe(inputs) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) - - -class IPEXQuestionAnsweringTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bert", - "distilbert", - "roberta", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) - - with torch.inference_mode(): - outputs = pipe(question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris.") - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." 
- ) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs["start"], outputs_ipex["start"]) - self.assertEqual(outputs["end"], outputs_ipex["end"]) - - -class IPEXTextGenerationTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bloom", - "gptj", - "gpt2", - "gpt_neo", - "gpt_bigcode", - "llama", - "llama2", - "opt", - "mpt", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, return_dict=False) - model = model.eval() - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a simple input" - text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) - with torch.inference_mode(): - output = text_generator(inputs) - with ipex_inference_mode( - text_generator, dtype=model.config.torch_dtype, verbose=False, jit=True - ) as ipex_text_generator: - output_ipex = ipex_text_generator(inputs) - self.assertTrue(isinstance(ipex_text_generator.model._optimized, IPEXModel)) - self.assertTrue(isinstance(ipex_text_generator.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(output[0]["generated_text"], output_ipex[0]["generated_text"]) From 2696e6fe4729424bc4f85a508b48c69217e3eb17 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 3 Sep 2024 13:56:06 +0400 Subject: [PATCH 7/9] Add export config for gemma2 (#876) * add export config for gemma2 * update cache position and tests * update model list * fix without cache export * patch original torch gemma2 to work with dynamic cache * Update tests/openvino/test_modeling.py * prevent usage cache implementation * add min transformers version --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 21 ++++++++ optimum/exporters/openvino/model_patcher.py | 60 ++++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 2 + tests/openvino/test_modeling.py | 33 ++++++++++-- tests/openvino/utils_tests.py | 1 + 6 files changed, 112 insertions(+), 6 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index b82e68fe4e..07da423f8d 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -55,6 +55,7 @@ Here is the list of the supported architectures : - GPT-NeoX - GPT-NeoX-Japanese - Gemma +- Gemma2 - Hubert - IBert - InternLM diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0ad38927a2..ca62b82de6 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -54,6 +54,7 @@ CodeGenModelPatcher, DBRXModelPatcher, FalconModelPatcher, + Gemma2ModelPatcher, GptNeoxJapaneseModelPatcher, GptNeoxModelPatcher, InternLM2Patcher, @@ -997,3 +998,23 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "gemma2", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class Gemma2OpenVINOConfig(GemmaOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.43.0") + + def patch_model_for_export( + self, model: Union["PreTrainedModel", 
"TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8cb745bd72..59d4bedb51 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import inspect import logging as log import math @@ -23,7 +24,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.utils import is_tf_available -from optimum.exporters.onnx.model_patcher import DecoderModelPatcher +from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, override_arguments from optimum.intel.utils.import_utils import ( _openvino_version, _torch_version, @@ -2409,3 +2410,60 @@ def __enter__(self): super().__enter__() for layer in self._model.gpt_neox_japanese.layers: _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + + +class Gemma2ModelPatcher(LlamaModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + from transformers.cache_utils import DynamicCache + + signature = inspect.signature(self.orig_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + return_legacy_cache = False + pkv_in_args = False + legacy_pkv = None + if "past_key_values" in kwargs: + legacy_pkv = kwargs.pop("past_key_values", None) + sign_names = list(signature.parameters.keys()) + pkv_argument_index = sign_names.index("past_key_values") + cache_position_index = sign_names.index("cache_position") if "cache_position" in sign_names else -1 + input_ids_index = sign_names.index("input_ids" if "input_ids" in sign_names else "inputs_embeds") + if legacy_pkv is None and len(args) > pkv_argument_index: + legacy_pkv = args[pkv_argument_index] + pkv_in_args = True + if legacy_pkv is not None: + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + return_legacy_cache = True + if not pkv_in_args: + kwargs["past_key_values"] = pkv + else: + args[pkv_argument_index] = pkv + + if ( + return_legacy_cache + and cache_position_index != -1 + and (cache_position_index > len(args) and "cache_position" not in kwargs) + ): + past_seen_tokens = legacy_pkv[0][0].shape[-2] + input_ids = args[input_ids_index] + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + input_ids.shape[1], device=input_ids.device + ) + kwargs["cache_position"] = cache_position + + outputs = self.orig_forward(*args, **kwargs) + if return_legacy_cache: + outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + + return outputs + + self.patched_forward = patched_forward diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 554fdee7cd..de094508c9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -806,6 +806,8 @@ def _from_pretrained( force_download=force_download, local_files_only=local_files_only, ) + if getattr(generation_config, "cache_implementation", None) is not None: + generation_config.cache_implementation = None 
kwargs["generation_config"] = generation_config except Exception: pass diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d71bbea453..6f24ea0de5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -57,6 +57,7 @@ from transformers.testing_utils import slow from utils_tests import MODEL_NAMES +from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -647,6 +648,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ( "gemma", + "gemma2", "olmo", "stablelm", "starcoder2", @@ -728,7 +730,8 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4)) # Qwen tokenizer does not support padding - if model_arch == "qwen": + + if model_arch in ["qwen"]: return if model_arch not in ["chatglm", "glm4", "persimmon"]: @@ -753,7 +756,16 @@ def test_compare_to_transformers(self, model_arch): ) ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + additional_inputs = {} + # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # align cache representation in torch model + if model_arch == "gemma2": + patch_update_causal_mask(transformers_model, "4.43.0") + transformers_model._supports_cache_class = True + from transformers.cache_utils import DynamicCache + + additional_inputs = {"past_key_values": DynamicCache()} + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) self.assertTrue(torch.allclose(ov_outputs, transformers_outputs)) del transformers_model @@ -921,8 +933,8 @@ def test_beam_search(self, model_arch): "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), "trust_remote_code": True, } - # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search - if model_arch in ["qwen", "chatglm"]: + # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search + if model_arch in ["qwen", "chatglm", "glm4"]: return tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) @@ -988,6 +1000,12 @@ def test_beam_search(self, model_arch): if model_arch == "arctic": transformers_model.to(torch.float32) + additional_inputs = {} + # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model + if model_arch == "gemma2": + patch_update_causal_mask(transformers_model, "4.43.0") + transformers_model._supports_cache_class = True + from transformers.cache_utils import DynamicCache tokenizer.pad_token_id = tokenizer.eos_token_id tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) tokens.pop("token_type_ids", None) @@ -1002,7 +1020,12 @@ def test_beam_search(self, model_arch): if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: continue set_seed(SEED) - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + + if model_arch == "gemma2": + additional_inputs = {"past_key_values": 
DynamicCache()} + transformers_outputs = transformers_model.generate( + **tokens, generation_config=gen_config, **additional_inputs + ) set_seed(SEED) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) self.assertTrue( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 86a0a51e80..869d5897e6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -54,6 +54,7 @@ "electra": "hf-internal-testing/tiny-random-electra", "exaone": "katuni4ka/tiny-random-exaone", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", + "gemma2": "katuni4ka/tiny-random-gemma2", "falcon": "fxmarty/really-tiny-falcon-testing", "falcon-40b": "katuni4ka/tiny-random-falcon-40b", "flaubert": "hf-internal-testing/tiny-random-flaubert", From b51ca3f66b3f7322749884288b69187adba007fd Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:59:52 +0200 Subject: [PATCH 8/9] Transformers 4.44 support (#877) * test * fix ipex bloom * fix bloom * style * fix * use bloom specific modeling when export version is lower than 4.44 * fix --- .github/workflows/test_ipex.yml | 2 +- .github/workflows/test_openvino.yml | 2 +- .github/workflows/test_openvino_basic.yml | 2 +- optimum/exporters/ipex/model_patcher.py | 2 +- optimum/exporters/openvino/stateful.py | 6 +++-- optimum/intel/ipex/modeling_base.py | 2 +- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 27 +++++++++++++++------- setup.py | 6 ++--- 9 files changed, 32 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 8b97bdd535..8cdfe30b58 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: python-version: [3.9] - transformers-version: ["4.39.0", "4.43.*"] + transformers-version: ["4.39.0", "4.44.*"] ipex-version: ["2.2.0", "2.3.*"] include: - python-version: 3.8 diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2262407898..335acf669b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.43.*"] + transformers-version: ["4.36.0", "4.44.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index 28c8369c75..ced98dd9ab 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -24,7 +24,7 @@ jobs: # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] os: ["ubuntu-22.04", "windows-latest"] - transformers-version: ["4.43.*"] + transformers-version: ["4.44.*"] include: - python-version: "3.12" os: "ubuntu-22.04" diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 216c1c3918..484fd38077 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -40,7 +40,7 @@ # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.43.99" +_TRANSFORMERS_MAX_VERSION = "4.44.99" _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",) diff --git 
a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index d8132107a1..a0c82108f1 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -21,7 +21,7 @@ import openvino as ov from openvino.runtime import opset13 from optimum.exporters import TasksManager -from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version +from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version, is_transformers_version def model_has_state(ov_model: ov.Model): @@ -216,7 +216,9 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0 fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) - num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1 + num_attention_heads = ( + config.num_attention_heads if (config.model_type == "bloom" and is_transformers_version("<", "4.44")) else 1 + ) make_stateful( ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None ) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 568e5be629..d6467f76a2 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -564,7 +564,7 @@ def _prepare_past_key_values(self, input_ids): ] ) return past_key_values - elif model_type == "bloom": + elif model_type == "bloom" and is_transformers_version("<", "4.44"): shape_key = (batch_size * num_attention_heads, d_k, 0) shape_value = (batch_size * num_attention_heads, 0, d_k) key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 90c43b7805..98fec17351 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -140,7 +140,7 @@ def dtype(self) -> Optional[torch.dtype]: def load_model( file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - ): + ) -> openvino.runtime.Model: """ Loads the model. 
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index de094508c9..23117e9361 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -25,6 +25,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from openvino.preprocess import PrePostProcessor from openvino.runtime import Core, Tensor, Type +from packaging.version import Version from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin @@ -38,7 +39,7 @@ from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful from ...exporters.openvino.stateful import model_has_state -from ..utils.import_utils import is_nncf_available, is_transformers_version +from ..utils.import_utils import compare_versions, is_nncf_available, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import ( OVConfig, @@ -51,8 +52,8 @@ if TYPE_CHECKING: + from transformers.generation.streamers import BaseStreamer from transformers.modeling_utils import PreTrainedModel - from transformers.streamers import BaseStreamer logger = logging.getLogger(__name__) @@ -404,7 +405,10 @@ def prepare_inputs( **kwargs, ) -> Dict: batch_size = input_ids.shape[0] - if self.config.model_type == "bloom": + model_transformers_version = Version( + self.model.rt_info["optimum"]["transformers_version"].value if "optimum" in self.model.rt_info else "0.0.0" + ) + if self.config.model_type == "bloom" and compare_versions(model_transformers_version, "<", "4.44"): batch_size *= self.config.num_attention_heads inputs = {} @@ -619,7 +623,10 @@ def _deduplicate_inputs(self, model_inputs: Dict): shape = input_tensor.shape if isinstance(input_tensor, Tensor) else list(input_tensor.shape) dtype = input_tensor.element_type if isinstance(input_tensor, Tensor) else Type(input_tensor.dtype) upd_batch_size = indicies.shape[0] - if self.config.model_type == "bloom": + export_transformers_version = Version(self.model.rt_info["optimum"]["transformers_version"].value) + if self.config.model_type == "bloom" and compare_versions( + export_transformers_version, "<", "4.44" + ): upd_batch_size *= self.config.num_attention_heads shape[ ( @@ -631,10 +638,11 @@ def _deduplicate_inputs(self, model_inputs: Dict): upd_model_inputs[input_name] = Tensor(dtype, shape) upd_model_inputs["input_ids"] = unique_input_ids if "beam_idx" in model_inputs: + export_transformers_version = Version(self.model.rt_info["optimum"]["transformers_version"].value) beam_range = ( - unique_input_ids.shape[0] - if self.config.model_type != "bloom" - else unique_input_ids.shape[0] * self.config.num_attention_heads + unique_input_ids.shape[0] * self.config.num_attention_heads + if (self.config.model_type == "bloom" and compare_versions(export_transformers_version, "<", "4.44")) + else unique_input_ids.shape[0] ) beam_idx = np.arange(beam_range, dtype=int) upd_model_inputs["beam_idx"] = beam_idx @@ -781,7 +789,10 @@ def _from_pretrained( model = cls.load_model(model_cache_path) model_type = config.model_type.replace("_", "-") - if model_type == "bloom": + export_transformers_version = Version( + model.rt_info["optimum"]["transformers_version"].value if "optimum" in model.rt_info else "0.0.0" + ) + if model_type == "bloom" and compare_versions(export_transformers_version, "<", "4.44"): init_cls = 
OVBloomForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM diff --git a/setup.py b/setup.py index e637f49e18..cd488f8301 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.44.0", + "transformers>=4.36,<4.45", "optimum@git+https://github.com/huggingface/optimum.git", "datasets>=1.4.0", "sentencepiece", @@ -59,10 +59,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43.0"], + "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 40194a016723acc99b73310cc5320c346f15f691 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 4 Sep 2024 10:01:45 +0200 Subject: [PATCH 9/9] Deprecate export parameters (#886) --- optimum/exporters/openvino/__main__.py | 38 -------------------------- tests/openvino/test_export.py | 3 -- tests/openvino/test_exporters_cli.py | 6 +--- 3 files changed, 1 insertion(+), 46 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index c4b6ef0cd8..842198625d 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -49,15 +49,6 @@ import torch -_COMPRESSION_OPTIONS = { - "int8": {"bits": 8}, - "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128}, - "int4_asym_g128": {"bits": 4, "sym": False, "group_size": 128}, - "int4_sym_g64": {"bits": 4, "sym": True, "group_size": 64}, - "int4_asym_g64": {"bits": 4, "sym": False, "group_size": 64}, -} - - logger = logging.getLogger(__name__) @@ -108,8 +99,6 @@ def main_export( model_kwargs: Optional[Dict[str, Any]] = None, custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - compression_option: Optional[str] = None, - compression_ratio: Optional[float] = None, ov_config: "OVConfig" = None, stateful: bool = True, convert_tokenizer: bool = False, @@ -171,11 +160,6 @@ def main_export( fn_get_submodels (`Optional[Callable]`, defaults to `None`): Experimental usage: Override the default submodels that are used at the export. This is especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. - compression_option (`Optional[str]`, defaults to `None`): - The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, - `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression. - compression_ratio (`Optional[float]`, defaults to `None`): - Compression ratio between primary and backup precision (only relevant to INT4). stateful (`bool`, defaults to `True`): Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. 
Applicable only for decoder models. **kwargs_shapes (`Dict`): @@ -198,28 +182,6 @@ def main_export( raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") token = use_auth_token - if compression_option is not None: - logger.warning( - "The `compression_option` argument is deprecated and will be removed in optimum-intel v1.17.0. " - "Please, pass an `ov_config` argument instead `OVConfig(..., quantization_config=quantization_config)`." - ) - - if compression_ratio is not None: - logger.warning( - "The `compression_ratio` argument is deprecated and will be removed in optimum-intel v1.17.0. " - "Please, pass an `ov_config` argument instead `OVConfig(quantization_config={ratio=compression_ratio})`." - ) - - if ov_config is None and compression_option is not None: - from ...intel.openvino.configuration import OVConfig - - if compression_option == "fp16": - ov_config = OVConfig(dtype="fp16") - elif compression_option != "fp32": - q_config = _COMPRESSION_OPTIONS[compression_option] if compression_option in _COMPRESSION_OPTIONS else {} - q_config["ratio"] = compression_ratio or 1.0 - ov_config = OVConfig(quantization_config=q_config) - original_task = task task = infer_task( task, model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index ef20ed5a2d..d48e86fe27 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -16,7 +16,6 @@ import unittest from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional import torch from parameterized import parameterized @@ -76,7 +75,6 @@ class ExportModelTest(unittest.TestCase): def _openvino_export( self, model_type: str, - compression_option: Optional[str] = None, stateful: bool = True, patch_16bit_model: bool = False, ): @@ -106,7 +104,6 @@ def _openvino_export( output=Path(tmpdirname), task=supported_task, preprocessors=preprocessors, - compression_option=compression_option, stateful=stateful, ) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 6380a52881..9da496ae05 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -108,16 +108,12 @@ class OVCLIExportTestCase(unittest.TestCase): ), ] - def _openvino_export( - self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None - ): + def _openvino_export(self, model_name: str, task: str): with TemporaryDirectory() as tmpdir: main_export( model_name_or_path=model_name, output=tmpdir, task=task, - compression_option=compression_option, - compression_ratio=compression_ratio, ) @parameterized.expand(SUPPORTED_ARCHITECTURES)
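# A hedged sketch of the replacement for the removed `compression_option` / `compression_ratio`
# arguments, based on the mapping that previously lived in _COMPRESSION_OPTIONS: what used to be
# compression_option="int4_sym_g128", compression_ratio=0.8 becomes an explicit OVConfig carrying
# a weight-quantization dict. The model id and output path are placeholders.
from optimum.exporters.openvino import main_export
from optimum.intel.openvino.configuration import OVConfig

ov_config = OVConfig(quantization_config={"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8})
main_export(
    model_name_or_path="gpt2",       # placeholder
    output="exported_ov_model",      # placeholder
    task="text-generation-with-past",
    ov_config=ov_config,
)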