From 9a18ae0119ee5e6669b42f49a6fe7ac3397ac55b Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Tue, 27 Aug 2024 09:19:47 -0700 Subject: [PATCH 1/9] set cpu affinity and membind for better oob performance (#853) * set num threads and memory binding for better OOB performance * clean env var * added core and memory binding util for improved performance * add example usage in docstring * change utlity for best oob to support world_size and rank >=1 * fix style * fix node_id value to account for rank_id starts at zero * numa node assignment calculated from local size not from world size * reorg imports, moved checks to import_utils, remove prints for logger * raise Errors with missing pkg and unsupported OS * added missng env var to list * Update optimum/intel/utils/modeling_utils.py * Update optimum/intel/utils/import_utils.py * Update optimum/intel/utils/import_utils.py * fix style quality error --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docker/Dockerfile.intel | 9 +-- optimum/intel/utils/__init__.py | 1 + optimum/intel/utils/import_utils.py | 12 ++++ optimum/intel/utils/modeling_utils.py | 82 +++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile.intel b/docker/Dockerfile.intel index 60fd51b424..a7f1dc978f 100644 --- a/docker/Dockerfile.intel +++ b/docker/Dockerfile.intel @@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ libpng-dev \ python3 \ python3-pip \ + python3-dev \ + libnuma-dev \ && rm -rf /var/lib/apt/lists/*" RUN /usr/sbin/update-ccache-symlinks RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache @@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \ torchaudio==${TORCHAUDIO_VERSION} \ -f https://download.pytorch.org/whl/torch_stable.html && \ python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ - python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python3 -m pip install --no-cache-dir numa -ARG OMP_NUM_THREADS=1 -ENV OMP_NUM_THREADS=${OMP_NUM_THREADS} ARG KMP_BLOCKTIME=1 ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} ARG KMP_HW_SUBSET=1T ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} -ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so" \ No newline at end of file +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/optimum/intel/utils/__init__.py b/optimum/intel/utils/__init__.py index d77588f896..50cdfa143e 100644 --- a/optimum/intel/utils/__init__.py +++ b/optimum/intel/utils/__init__.py @@ -22,6 +22,7 @@ is_neural_compressor_available, is_neural_compressor_version, is_nncf_available, + is_numa_available, is_openvino_available, is_torch_version, is_transformers_available, diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 6be0aac47a..032280e940 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -150,6 +150,14 @@ except importlib_metadata.PackageNotFoundError: _accelerate_available = False +_numa_available = importlib.util.find_spec("numa") is not None + +if _numa_available: + try: + importlib_metadata.version("numa") + except importlib_metadata.PackageNotFoundError: + _numa_available = False + def is_transformers_available(): return _transformers_available @@ -272,6 +280,10 @@ def 
is_accelerate_available(): return _accelerate_available +def is_numa_available(): + return _numa_available + + # This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): """ diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index cd5b34f86f..1d2f7b03c5 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -12,16 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging +import math +import os +import platform import re from pathlib import Path from typing import List, Optional, Union +import psutil import torch from huggingface_hub import HfApi, HfFolder +from .import_utils import is_numa_available + MULTI_QUERY_ATTN_MODELS = {"gpt_bigcode"} +logger = logging.getLogger(__name__) + def get_model_device(model: torch.nn.Module) -> torch.device: """ @@ -135,3 +144,76 @@ def replace_customized_linear_with_linear(model): setattr(model, child_name, new_m) else: replace_customized_linear_with_linear(child) + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def bind_cores_for_best_perf(): + """ + Set number of threads per rank, numa cpu affinity and numa memory binding if not already set for better OOB performance. + Works for wold_size >= 1 and rank >= 1 + + Example: + .. code-block:: python + + from optimum.intel.ipex import IPEXModelForCausalLM + from optimum.intel.utils.modeling_utils import bind_cores_for_best_perf + + bind_cores_for_best_perf() + model = IPEXModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.bfloat16, export=True) + tokenizer = AutoTokenizer.from_pretrained("gpt2") + input_sentence = ["tell me a story about a trip to the moon"] + model_inputs = tokenizer(input_sentence, return_tensors="pt") + generation_kwargs = dict(max_new_tokens=500) + generated_ids = model.generate(**model_inputs, **generation_kwargs) + + Returns: + None + + """ + if platform.system() != "Linux": + logger.error("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + raise OSError("bind_cores_for_best_perf: OS not supported, this function can only be run on Linux systems.") + if not is_numa_available(): + logger.error("'numa' module not found") + raise ImportError("'numa' module not found, install with 'pip install numa'") + import numa + + local_size = get_int_from_env( + ["LOCAL_WORLD_SIZE", "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1 + ) + rank_id = get_int_from_env( + ["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0 + ) + nodes = numa.get_max_node() + 1 + rank_per_node = math.ceil(local_size / nodes) + num_cpus_per_nodes = int(psutil.cpu_count(logical=False) / nodes) + node_id = int(rank_id / rank_per_node) + rank_offset_per_node = rank_id % rank_per_node + if os.getenv("OMP_NUM_THREADS") is None: + num_cpus_per_rank = max(int(num_cpus_per_nodes / rank_per_node), 1) + logger.info(f"Setting OMP_NUM_THREADS to {num_cpus_per_rank} for better performance") + else: + num_cpus_per_rank = int(os.getenv("OMP_NUM_THREADS")) + 
logger.info(f"OMP_NUM_THREADS already set to {num_cpus_per_rank}") + if len(numa.get_membind()) == nodes: + # if numa memory binding is not set, set it to the node where the rank is running + numa.set_membind([node_id]) + + torch.set_num_threads(num_cpus_per_rank) + + if len(numa.get_affinity(0)) == psutil.cpu_count(logical=True): + # if numa affinity is unset (default value is set to all logical cores) set it to the physical cores assigned to the rank + cpu_start = num_cpus_per_rank * rank_offset_per_node + numa.set_affinity( + 0, + list(numa.node_to_cpus(node_id))[cpu_start : cpu_start + num_cpus_per_rank], + ) + logger.info(f"affinity={numa.get_affinity(0)}, membind = {numa.get_membind()}") From af8c28d46e2e3b589170866502093c5af34b749c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 15:43:45 +0400 Subject: [PATCH 2/9] Fix openvino nightly install in tests (#885) * fix openvino nightly install in tests * Update .github/workflows/test_openvino.yml --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_openvino.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 13a6b83e57..2262407898 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -51,7 +51,6 @@ jobs: pytest tests/openvino/test_modeling_basic.py - name: Test openvino-nightly run: | - pip uninstall -y openvino - pip install openvino-nightly + pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov From d6e6e1f0350ef0b66dab5266196d56f3a5dd4c7c Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 30 Aug 2024 17:06:07 +0400 Subject: [PATCH 3/9] Fix attention mask for glm4 (#884) --- optimum/exporters/openvino/model_patcher.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6e65f4f11a..8cb745bd72 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -308,10 +308,9 @@ def _chatglm2_core_attention_forward(self, query_layer, key_layer, value_layer, def _glm4_core_attention_forward(self, query_layer, key_layer, value_layer, attention_mask): - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, key_layer, value_layer, attention_mask.to(torch.float32) - ) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + causal_mask.masked_fill_(attention_mask, float("-inf")) + context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, causal_mask) context_layer = context_layer.transpose(1, 2).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) context_layer = context_layer.reshape(*new_context_layer_shape) From b5998f2f44e581b102ed7a9b714ac0f7c2d51a66 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Fri, 30 Aug 2024 16:11:11 +0200 Subject: [PATCH 4/9] Apply weight compression after model save to reduce peak RAM during export (#878) * Initial commit * Style * Adopt tests * 
Add no-nncf warning * Apply suggested changes * Do not save in fp16 in case of weight compression * Replace model files right away --- optimum/exporters/openvino/__main__.py | 50 ++++++++- optimum/exporters/openvino/convert.py | 38 +------ tests/openvino/test_quantization.py | 138 +++++++++++++------------ 3 files changed, 123 insertions(+), 103 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 77f8049606..c4b6ef0cd8 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -14,7 +14,9 @@ import gc import logging +import operator import warnings +from functools import reduce from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union @@ -23,18 +25,20 @@ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase from transformers.utils import is_torch_available +from openvino.runtime import Core, Type, save_model from optimum.exporters import TasksManager from optimum.exporters.onnx.base import OnnxConfig from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.openvino.convert import export_from_model from optimum.intel.utils.import_utils import ( + is_nncf_available, is_openvino_tokenizers_available, is_openvino_version, is_transformers_version, ) from optimum.utils.save_utils import maybe_load_preprocessors -from .utils import clear_class_registry +from .utils import _MAX_UNCOMPRESSED_SIZE, clear_class_registry if TYPE_CHECKING: @@ -402,7 +406,7 @@ class StoreAttr(object): model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - export_from_model( + submodel_paths = export_from_model( model=model, output=output, task=task, @@ -425,6 +429,48 @@ class StoreAttr(object): del model gc.collect() + core = Core() + for submodel_path in submodel_paths: + submodel_path = Path(output) / submodel_path + submodel = core.read_model(submodel_path) + + quantization_config = None + if ov_config is None: + num_parameters = 0 + for op in submodel.get_ops(): + if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]: + num_parameters += reduce(operator.mul, op.shape, 1) + if num_parameters >= _MAX_UNCOMPRESSED_SIZE: + if is_nncf_available(): + quantization_config = {"bits": 8, "sym": False} + logger.info("The model weights will be quantized to int8_asym.") + else: + logger.warning( + "The model will be converted with no weights quantization. Quantization of the weights to int8 " + "requires nncf. 
Please install it with `pip install nncf`" + ) + break + else: + quantization_config = ov_config.quantization_config + if quantization_config is None: + continue + + if not is_nncf_available(): + raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`") + + from optimum.intel.openvino.quantization import _weight_only_quantization + + _weight_only_quantization(submodel, quantization_config) + + compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" + save_model(submodel, compressed_submodel_path, compress_to_fp16=False) + del submodel + + submodel_path.unlink() + submodel_path.with_suffix(".bin").unlink() + compressed_submodel_path.rename(submodel_path) + compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin")) + # Unpatch modules after GPTQ export if do_gptq_patching: torch.cuda.is_available = orig_cuda_check diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 0b937734ce..dc2af68784 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -49,7 +49,6 @@ from .model_patcher import patch_model_with_bettertransformer from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful from .utils import ( - _MAX_UNCOMPRESSED_SIZE, OV_XML_FILE_NAME, clear_class_registry, flattenize_inputs, @@ -76,21 +75,7 @@ def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None): - compress_to_fp16 = False - - if ov_config is not None: - if ov_config.quantization_config: - if not is_nncf_available(): - raise ImportError( - "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`" - ) - - from optimum.intel.openvino.quantization import _weight_only_quantization - - _weight_only_quantization(model, ov_config.quantization_config) - - compress_to_fp16 = ov_config.dtype == "fp16" - + compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16" model = _add_version_info_to_model(model, library_name) save_model(model, path, compress_to_fp16) @@ -643,25 +628,6 @@ def export_from_model( ) logging.disable(logging.NOTSET) - if ov_config is None: - if library_name == "diffusers": - num_parameters = model.unet.num_parameters() - else: - num_parameters = sum(param.numel() for param in list(model.parameters()) if param.requires_grad) - - if num_parameters >= _MAX_UNCOMPRESSED_SIZE: - if is_nncf_available(): - from ...intel.openvino.configuration import OVConfig - - ov_config = OVConfig(quantization_config={"bits": 8, "sym": False}) - - logger.info("The model weights will be quantized to int8_asym.") - else: - logger.warning( - "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf." - "please install it with `pip install nncf`" - ) - if library_name != "diffusers": # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) @@ -720,6 +686,8 @@ def export_from_model( patch_16bit_model=patch_16bit_model, ) + return files_subpaths + def export_tokenizer( tokenizer, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 23ff3a03ca..5835bc76a2 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import inspect # ruff: noqa @@ -22,6 +23,7 @@ from enum import Enum from functools import partial from typing import Union + import pytest import evaluate import numpy as np @@ -538,76 +540,80 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): self.assertEqual(0, num_int8) def test_ovmodel_load_large_model_with_default_compressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(quantization_config={"bits": 8}), - library_name="transformers", - ) + def main_export_in_stacktrace(*args, **kwargs): + # Compression was called from `main_export` + self.assertTrue(inspect.stack()[5].function == "main_export") + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, compile=False, use_cache=False + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT8_ASYM, + "ratio": 1.0, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with( + unittest.mock.ANY, + **compression_params, + ) def test_ovmodel_load_large_model_with_uncompressed_weights(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - model_parameters.return_value = [mock_tensor] - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False - ) - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], export=True, load_in_8bit=False, compile=False, use_cache=False + ) + compress_weights_patch.assert_not_called() def test_ovmodel_load_large_model_with_additional_quantization_config(self): - with unittest.mock.patch("torch.nn.Module.parameters") as model_parameters: - mock_tensor = 
unittest.mock.Mock() - mock_tensor.numel = lambda: 2000000000 - mock_tensor.requires_grad = True - with unittest.mock.patch("openvino.runtime.ie_api.Core.read_model") as core_patch: - with unittest.mock.patch("optimum.exporters.openvino.convert._save_model") as save_model_patch: - with unittest.mock.patch("nncf.compress_weights") as compress_weights_patch: - _ = OVModelForCausalLM.from_pretrained( - MODEL_NAMES["llama"], - export=True, - compile=False, - use_cache=False, - quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), - ) - # quantization will be performed later, using load_model - save_model_patch.assert_called_with( - unittest.mock.ANY, - unittest.mock.ANY, - ov_config=OVConfig(dtype="auto"), - library_name="transformers", - ) - compression_params = { - "mode": nncf.CompressWeightsMode.INT4_SYM, - "ratio": 0.8, - "group_size": -1, - "all_layers": None, - "sensitivity_metric": None, - "dataset": None, - "ignored_scope": nncf.IgnoredScope(), - "awq": None, - "subset_size": 128, - "scale_estimation": None, - } - compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) + def main_export_not_in_stacktrace(*args, **kwargs): + # Compression was not called from `main_export` + self.assertTrue(all(frame_info.function != "main_export" for frame_info in inspect.stack())) + + with unittest.mock.patch( + "openvino.runtime.op.Constant.shape", new_callable=unittest.mock.PropertyMock + ) as ov_constant_shape: + ov_constant_shape.return_value = (2000000000,) + with unittest.mock.patch( + "nncf.compress_weights", side_effect=main_export_not_in_stacktrace + ) as compress_weights_patch: + _ = OVModelForCausalLM.from_pretrained( + MODEL_NAMES["llama"], + export=True, + compile=False, + use_cache=False, + quantization_config=OVWeightQuantizationConfig(bits=4, sym=True, group_size=-1, ratio=0.8), + ) + compression_params = { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "ratio": 0.8, + "group_size": -1, + "all_layers": None, + "sensitivity_metric": None, + "dataset": None, + "ignored_scope": nncf.IgnoredScope(), + "awq": None, + "subset_size": 128, + "scale_estimation": None, + } + compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4): From 9a8782446e394ac07283b8bd8b44916c4f297826 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Mon, 2 Sep 2024 18:23:53 +0200 Subject: [PATCH 5/9] Introduce support for `mxfp4` data type for OV weight compression (#882) * Add support for mxfp4_e2m1 data type for OV weight compression * Add new tests for mxfp4_e2m1. Adopt tests structure for new data type. 
* Style * Rename dtype to weight_format * Fix descriptions * Replace 'mxfp4_e2m1' with 'mxfp4' * Add checks for mxfp4 weight format * Dataset is possible in case of INT8 HQ * Address comments --- optimum/commands/export/openvino.py | 19 ++--- optimum/intel/openvino/configuration.py | 41 ++++++++- optimum/intel/openvino/quantization.py | 9 +- tests/openvino/test_exporters_cli.py | 37 +++----- tests/openvino/test_quantization.py | 109 +++++++++++++----------- tests/openvino/utils_tests.py | 23 +++-- 6 files changed, 138 insertions(+), 100 deletions(-) diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 742612ca35..5f6c209df6 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -70,9 +70,9 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--weight-format", type=str, - choices=["fp32", "fp16", "int8", "int4", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"], + choices=["fp32", "fp16", "int8", "int4", "mxfp4"], default=None, - help="he weight format of the exported model.", + help="The weight format of the exported model.", ) optional_group.add_argument( "--library", @@ -255,12 +255,11 @@ def run(self): elif self.args.weight_format in {"fp16", "fp32"}: ov_config = OVConfig(dtype=self.args.weight_format) else: - is_int8 = self.args.weight_format == "int8" - - # For int4 quantization if no parameter is provided, then use the default config if exist - if no_compression_parameter_provided(self.args) and not is_int8: + # For int4 quantization if no parameter is provided, then use the default config if exists + if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4": quantization_config = get_default_int4_config(self.args.model) else: + is_int8 = self.args.weight_format == "int8" quantization_config = { "bits": 8 if is_int8 else 4, "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), @@ -272,17 +271,11 @@ def run(self): "quant_method": "awq" if self.args.awq else "default", "sensitivity_metric": self.args.sensitivity_metric, "scale_estimation": self.args.scale_estimation, + "weight_format": self.args.weight_format, } if quantization_config.get("dataset", None) is not None: quantization_config["trust_remote_code"] = self.args.trust_remote_code - - if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}: - logger.warning( - f"--weight-format {self.args.weight_format} is deprecated, possible choices are fp32, fp16, int8, int4" - ) - quantization_config["sym"] = "asym" not in self.args.weight_format - quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64 ov_config = OVConfig(quantization_config=quantization_config) quantization_config = ov_config.quantization_config if ov_config else None diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index aaaca031b2..ed9638e18c 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -312,6 +312,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): scale_estimation (`bool`, *optional*): Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and compressed layers. Providing a dataset is required to run scale estimation. + weight_format (`str`, defaults to 'int'): + Data format weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4']. 
""" def __init__( @@ -329,6 +331,7 @@ def __init__( num_samples: Optional[int] = None, quant_method: Union[str, OVQuantizationMethod] = OVQuantizationMethod.DEFAULT, scale_estimation: bool = None, + weight_format: Optional[str] = None, **kwargs, ): super().__init__(bits=bits, sym=sym, ignored_scope=ignored_scope, num_samples=num_samples) @@ -341,6 +344,7 @@ def __init__( self.sensitivity_metric = sensitivity_metric self.quant_method = OVQuantizationMethod(quant_method) if isinstance(quant_method, str) else quant_method self.scale_estimation = scale_estimation + self.weight_format = weight_format self.post_init() def post_init(self): @@ -382,10 +386,38 @@ def post_init(self): raise ValueError( f"For 8-bit quantization, `group_size` is expected to be set to -1, but was set to {self.group_size}" ) + if self.all_layers: + raise ValueError("The `all_layers` parameter is not supported for 8-bit quantization") + if self.sensitivity_metric: + raise ValueError("The `sensitivity_metric` parameter is not supported for 8-bit quantization") + if self.quant_method == OVQuantizationMethod.AWQ: + raise ValueError( + "The AWQ algorithm is not supported for 8-bit quantization and got `quant_method='awq'`, please update accordingly" + ) + if self.scale_estimation: + raise ValueError( + "The Scale Estimation algorithm is not supported for 8-bit quantization and got `scale_estimation=True`, please set `scale_estimation=False`" + ) if self.tokenizer is not None and not isinstance(self.tokenizer, str): raise ValueError(f"Tokenizer is expected to be a string, but found {self.tokenizer}") + if self.weight_format is None: + self.weight_format = "int4" if self.bits == 4 else "int8" + if self.weight_format not in ["int4", "int8", "mxfp4"]: + raise ValueError( + f"Weight format must be one of the following: ['int4', 'int8', 'mxfp4'], but found: {self.weight_format}." 
+ ) + if self.weight_format == "mxfp4": + if self.bits != 4: + raise ValueError( + f"When applying weight compression with 'mxfp4' weight format the `bits` parameters must be set to 4, but found {self.bits}" + ) + if self.quant_method == OVQuantizationMethod.AWQ: + raise ValueError("The AWQ algorithm is not supported for 'mxfp4' weight format") + if self.scale_estimation: + raise ValueError("The Scale Estimation algorithm is not supported for 'mxfp4' weight format") + @dataclass class OVDynamicQuantizationConfig(OVWeightQuantizationConfig): @@ -473,8 +505,13 @@ def __init__( self.compression = kwargs.get( "compression", None ) # A field for backward-compatability of training-time compression parameters - bits = self.quantization_config.bits if self.quantization_config else None - self.dtype = "int" + str(bits) if isinstance(bits, int) else dtype + if self.quantization_config is not None: + if isinstance(self.quantization_config, OVWeightQuantizationConfig): + self.dtype = self.quantization_config.weight_format + else: + self.dtype = "int8" + else: + self.dtype = dtype def add_input_info(self, model_inputs: Dict, force_batch_one: bool = False): self.input_info = [ diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index df9d496de7..c858c3a63a 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -811,10 +811,13 @@ def _weight_only_quantization( if isinstance(config.sensitivity_metric, str): sensitivity_metric = getattr(SensitivityMetric, config.sensitivity_metric.upper()) - if config.bits == 8: - mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM + if config.weight_format == "mxfp4": + mode = CompressWeightsMode.E2M1 else: - mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM + if config.bits == 8: + mode = CompressWeightsMode.INT8_SYM if config.sym else CompressWeightsMode.INT8_ASYM + else: + mode = CompressWeightsMode.INT4_SYM if config.sym else CompressWeightsMode.INT4_ASYM return nncf.compress_weights( model, diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index b5aff8d175..6380a52881 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -89,31 +89,22 @@ class OVCLIExportTestCase(unittest.TestCase): ) TEST_4BIT_CONFIGURATONS = [ - ("text-generation-with-past", "opt125m", "int4_sym_g128", 4, 72), - ("text-generation-with-past", "opt125m", "int4_asym_g128", 4, 144), - ("text-generation-with-past", "opt125m", "int4_sym_g64", 4, 72), - ("text-generation-with-past", "opt125m", "int4_asym_g64", 4, 144), - ( - "text-generation-with-past", - "llama_awq", - "int4 --ratio 1.0 --sym --group-size 8 --all-layers", - 0, - 16, - ), + ("text-generation-with-past", "opt125m", "int4 --sym --group-size 128", {"int8": 4, "int4": 72}), + ("text-generation-with-past", "opt125m", "int4 --group-size 64", {"int8": 4, "int4": 144}), + ("text-generation-with-past", "opt125m", "mxfp4", {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}), + ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 8 --all-layers", {"int4": 16}), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 " "--sensitivity-metric max_activation_variance", - 4, - 14, + {"int8": 4, "int4": 14}, ), ( "text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 
--num-samples 100 ", - 4, - 14, + {"int8": 4, "int4": 14}, ), ] @@ -219,8 +210,8 @@ def test_exporters_cli_int8(self, task: str, model_type: str): expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_int8[i], num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_int8[i], num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_SD_HYBRID_ARCHITECTURES) def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: int, exp_num_int8: int): @@ -231,12 +222,12 @@ def test_exporters_cli_hybrid_quantization(self, model_type: str, exp_num_fq: in check=True, ) model = eval(_HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")]).from_pretrained(tmpdir) - num_fq, num_int8, _ = get_num_quantized_nodes(model.unet) - self.assertEqual(exp_num_int8, num_int8) + num_fq, num_weight_nodes = get_num_quantized_nodes(model.unet) + self.assertEqual(exp_num_int8, num_weight_nodes["int8"]) self.assertEqual(exp_num_fq, num_fq) @parameterized.expand(TEST_4BIT_CONFIGURATONS) - def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_int8: int, expected_int4: int): + def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expected_num_weight_nodes: dict): with TemporaryDirectory() as tmpdir: result = subprocess.run( f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", @@ -251,9 +242,9 @@ def test_exporters_cli_int4(self, task: str, model_type: str, option: str, expec else _HEAD_TO_AUTOMODELS[model_type.replace("-refiner", "")] ).from_pretrained(tmpdir, **model_kwargs) - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_int8, num_int8) - self.assertEqual(expected_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) self.assertTrue("--awq" not in option or b"Applying AWQ" in result.stdout) self.assertTrue("--scale-estimation" not in option or b"Applying Scale Estimation" in result.stdout) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 5835bc76a2..c263000f18 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -124,9 +124,9 @@ def preprocess_function(examples, tokenizer): ov_config=ov_config, ) model = model_cls.from_pretrained(tmp_dir, file_name=file_name) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -165,9 +165,9 @@ def preprocess_function(examples, tokenizer): model = model_cls.from_pretrained(tmp_dir) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ 
-189,11 +189,12 @@ class OVWeightCompressionTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_STATEFUL_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "gpt2", 44, 44),) LOAD_IN_4_BITS_SCOPE = ( + (OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8), {"int4": 30, "int8": 14}), ( OVModelForCausalLM, "gpt2", - dict(bits=4, sym=False, group_size=-1, ratio=0.8), - 14, + dict(bits=4, weight_format="mxfp4", group_size=32), + {"f4e2m1": 20, "f8e8m0": 20, "int8": 4}, ), ( OVModelForCausalLM, @@ -204,13 +205,13 @@ class OVWeightCompressionTest(unittest.TestCase): group_size=32, ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]}, ), - 4, + {"int4": 38, "int8": 4}, ), ( OVModelForCausalLM, "gpt2", dict(bits=4, sym=False, group_size=-1, ratio=0.8, all_layers=True), - 18, + {"int4": 26, "int8": 18}, ), ( OVModelForCausalLM, @@ -223,7 +224,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset="c4", ), - 14, + {"int4": 25, "int8": 14}, ), ( OVModelForCausalLM, @@ -236,7 +237,7 @@ class OVWeightCompressionTest(unittest.TestCase): sensitivity_metric="mean_activation_magnitude", dataset=["one two, " * i for i in range(10)], ), - 14, + {"int4": 25, "int8": 14}, ), ( OVModelForCausalLM, @@ -251,7 +252,7 @@ class OVWeightCompressionTest(unittest.TestCase): quant_method=QuantizationMethod.AWQ, scale_estimation=True, ), - 8, + {"int4": 12, "int8": 8}, ), ( OVModelForCausalLM, @@ -265,7 +266,7 @@ class OVWeightCompressionTest(unittest.TestCase): dataset="c4", quant_method="awq", ), - 8, + {"int4": 12, "int8": 8}, ), ) @@ -308,8 +309,8 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_pt_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_pt_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -339,8 +340,8 @@ def test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_p quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -365,9 +366,9 @@ def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_i quantizer.quantize(save_directory=tmp_dir, ov_config=ov_config) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_int8, num_int8) - self.assertEqual(expected_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) + self.assertEqual(expected_int4, num_weight_nodes["int4"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -391,8 +392,8 @@ def test_ovmodel_8bit_weight_compression_stateful(self, model_cls, model_name, e quantizer.quantize(save_directory=tmp_dir) model = model_cls.from_pretrained(tmp_dir) - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, 
num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -418,8 +419,8 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8[i], num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8[i], num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION) def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_fake_quantize, expected_ov_int8): @@ -428,10 +429,10 @@ def test_ovmodel_hybrid_quantization(self, model_cls, model_type, expected_num_f with tempfile.TemporaryDirectory() as tmp_dir: model = model_cls.from_pretrained(model_id, export=True, quantization_config=quantization_config) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) - self.assertEqual(expected_ov_int8, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) model.save_pretrained(tmp_dir) @@ -442,10 +443,10 @@ def test_stable_diffusion_with_weight_compression(self): quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config)) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(int8_pipe.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(int8_pipe.unet) self.assertEqual(0, num_fake_quantize) - self.assertEqual(242, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(242, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION[-1:]) def test_ovmodel_hybrid_quantization_with_custom_dataset( @@ -461,10 +462,10 @@ def test_ovmodel_hybrid_quantization_with_custom_dataset( self.assertEqual(quantization_config.quant_method, OVQuantizationMethod.HYBRID) quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config), calibration_dataset=dataset) - num_fake_quantize, num_int8, num_int4 = get_num_quantized_nodes(model.unet) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model.unet) self.assertEqual(expected_num_fake_quantize, num_fake_quantize) - self.assertEqual(expected_ov_int8, num_int8) - self.assertEqual(0, num_int4) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) + self.assertEqual(0, num_weight_nodes["int4"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_AUTOCOMPRESSED_MATMULS) @unittest.mock.patch.dict( @@ -478,9 +479,9 @@ def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int8, num_int4 = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int4, num_weight_nodes["int4"]) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) @@ -492,7 +493,7 @@ 
def test_ovmodel_4bit_auto_compression(self, model_cls, model_type, expected_ov_ @parameterized.expand(LOAD_IN_4_BITS_SCOPE) def test_ovmodel_4bit_auto_compression_with_config( - self, model_cls, model_name, quantization_config, expected_ov_int4 + self, model_cls, model_name, quantization_config, expected_num_weight_nodes ): model_id = MODEL_NAMES[model_name] with tempfile.TemporaryDirectory() as tmp_dir: @@ -506,13 +507,14 @@ def test_ovmodel_4bit_auto_compression_with_config( if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int4, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - self.assertEqual(openvino_config.dtype, "int4") + self.assertEqual(openvino_config.dtype, quantization_config.weight_format) @parameterized.expand(((OVModelForCausalLM, "gpt2"),)) def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_type): @@ -521,8 +523,8 @@ def test_ovmodel_stateful_load_with_compressed_weights(self, model_cls, model_ty self.assertTrue(model.use_cache) expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type][0] - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int8, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(expected_ov_int8, num_weight_nodes["int8"]) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): @@ -536,8 +538,8 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): models = [model] for i, model in enumerate(models): - _, num_int8, _ = get_num_quantized_nodes(model) - self.assertEqual(0, num_int8) + _, num_weight_nodes = get_num_quantized_nodes(model) + self.assertEqual(0, num_weight_nodes["int8"]) def test_ovmodel_load_large_model_with_default_compressed_weights(self): def main_export_in_stacktrace(*args, **kwargs): @@ -616,7 +618,9 @@ def main_export_not_in_stacktrace(*args, **kwargs): compress_weights_patch.assert_called_with(unittest.mock.ANY, **compression_params) @parameterized.expand(LOAD_IN_4_BITS_SCOPE) - def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantization_config, expected_ov_int4): + def test_ovmodel_4bit_dynamic_with_config( + self, model_cls, model_name, quantization_config, expected_num_weight_nodes + ): model_id = MODEL_NAMES[model_name] with tempfile.TemporaryDirectory() as tmp_dir: group_size = quantization_config.pop("group_size", 32) @@ -631,13 +635,14 @@ def test_ovmodel_4bit_dynamic_with_config(self, model_cls, model_name, quantizat if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token - _, num_int4, _ = get_num_quantized_nodes(model) - self.assertEqual(expected_ov_int4, num_int4) + _, num_weight_nodes = get_num_quantized_nodes(model) + expected_num_weight_nodes.update({k: 0 for k in set(num_weight_nodes) - set(expected_num_weight_nodes)}) + self.assertEqual(expected_num_weight_nodes, num_weight_nodes) model.save_pretrained(tmp_dir) openvino_config = OVConfig.from_pretrained(tmp_dir) self.assertEqual(openvino_config.quantization_config.bits, 4) - 
self.assertEqual(openvino_config.dtype, "int4") + self.assertEqual(openvino_config.dtype, quantization_config.weight_format) class OVQuantizerQATest(unittest.TestCase): @@ -764,9 +769,9 @@ def compute_metrics(p): trainer.save_model() model = OVModelForSequenceClassification.from_pretrained(tmp_dir) - num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) + num_fake_quantize, num_weight_nodes = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) - self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int8, num_weight_nodes["int8"]) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) @@ -793,7 +798,7 @@ class OVQuantizationConfigTest(unittest.TestCase): quant_method=OVQuantizationMethod.DEFAULT, ), ), - (OVWeightQuantizationConfig(dataset=["hello world", "i'm alive"]),), + (OVWeightQuantizationConfig(bits=4, dataset=["hello world", "i'm alive"]),), ( OVQuantizationConfig( ignored_scope={"names": ["op_name"]}, @@ -836,7 +841,7 @@ class OVQuantizationConfigTest(unittest.TestCase): (dict(num_samples=100), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), (dict(abc="def"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config"), ( - dict(bits=8, fast_bias_correction=True, dataset="wikitext2"), + dict(bits=4, fast_bias_correction=True, dataset="wikitext2"), OVWeightQuantizationConfig, "Can't determine type of OV quantization config", ), @@ -858,7 +863,7 @@ class OVQuantizationConfigTest(unittest.TestCase): (dict(abc="def", weight_only=False), OVQuantizationConfig, None), (dict(abc="def", weight_only=True), OVWeightQuantizationConfig, None), ( - dict(bits=8, fast_bias_correction=True, dataset="wikitext2", weight_only=True), + dict(bits=4, fast_bias_correction=True, dataset="wikitext2", weight_only=True), OVWeightQuantizationConfig, None, ), diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index f13723eef5..86a0a51e80 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -167,14 +167,23 @@ def get_num_quantized_nodes(ov_model): num_fake_quantize = 0 - num_int8 = 0 - num_int4 = 0 + num_weight_nodes = { + "int8": 0, + "int4": 0, + "f4e2m1": 0, + "f8e8m0": 0, + } for elem in ov_model.model.get_ops(): if "FakeQuantize" in elem.name: num_fake_quantize += 1 for i in range(elem.get_output_size()): - if elem.get_output_element_type(i).get_type_name() in ["i8", "u8"]: - num_int8 += 1 - if elem.get_output_element_type(i).get_type_name() in ["i4", "u4"]: - num_int4 += 1 - return num_fake_quantize, num_int8, num_int4 + type_name = elem.get_output_element_type(i).get_type_name() + if type_name in ["i8", "u8"]: + num_weight_nodes["int8"] += 1 + if type_name in ["i4", "u4"]: + num_weight_nodes["int4"] += 1 + if type_name == "f4e2m1": + num_weight_nodes["f4e2m1"] += 1 + if type_name == "f8e8m0": + num_weight_nodes["f8e8m0"] += 1 + return num_fake_quantize, num_weight_nodes From d4e3128300eecba2bbded4a2fa2fa7a1bbbc78e6 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Tue, 3 Sep 2024 17:33:16 +0800 Subject: [PATCH 6/9] Deprecate ipex inference mode (#837) * rm ipex inference * rm inference mode --- optimum/intel/__init__.py | 2 - optimum/intel/ipex/__init__.py | 2 - optimum/intel/ipex/inference.py | 158 ---------------------- optimum/intel/utils/dummy_ipex_objects.py | 7 - tests/ipex/test_inference.py | 117 ---------------- 5 files changed, 286 deletions(-) 
delete mode 100644 optimum/intel/ipex/inference.py delete mode 100644 tests/ipex/test_inference.py diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 2f2d1cb669..7f76c28543 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -44,7 +44,6 @@ ] else: _import_structure["ipex"] = [ - "inference_mode", "IPEXModelForCausalLM", "IPEXModelForSequenceClassification", "IPEXModelForMaskedLM", @@ -196,7 +195,6 @@ IPEXModelForQuestionAnswering, IPEXModelForSequenceClassification, IPEXModelForTokenClassification, - inference_mode, ) try: diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index 83943176b2..c1f711acfc 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -22,5 +22,3 @@ IPEXModelForSequenceClassification, IPEXModelForTokenClassification, ) - -from .inference import inference_mode diff --git a/optimum/intel/ipex/inference.py b/optimum/intel/ipex/inference.py deleted file mode 100644 index a628ebe12e..0000000000 --- a/optimum/intel/ipex/inference.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# ruff: noqa - -import logging -from typing import Union - -import torch -from torch import nn -from transformers import add_start_docstrings -from transformers.pipelines import Pipeline -from transformers.utils import is_ipex_available - -from ...exporters.tasks import TasksManager -from ..generation.modeling import jit_trace -from .modeling_base import ( - IPEXModel, - IPEXModelForCausalLM, - IPEXModelForMaskedLM, - IPEXModelForSequenceClassification, - IPEXModelForTokenClassification, - IPEXModelForQuestionAnswering, -) - - -from .utils import _HEAD_TO_AUTOMODELS - - -logger = logging.getLogger(__name__) - -IPEX_NOT_AVAILABLE_ERROR_MSG = ( - "Intel PyTorch Extensions was not found." - "please make sure you've installed the package or run " - "pip install intel_extension_for_pytorch" -) - -if is_ipex_available(): - import intel_extension_for_pytorch as ipex - - -class _ModelFallbackWrapper: - __slots__ = ("_optimized", "_default") - - def __init__(self, optimized, default): - self._optimized = optimized - self._default = default - - def __call__(self, *args, **kwargs): - try: - return self._optimized(*args, **kwargs) - except Exception: - return self._default(*args, **kwargs) - - def __getattr__(self, item): - if not item.startswith("__"): - return getattr(self._default, item) - else: - return self.item - - -@add_start_docstrings( - """ - inference_mode is an Intel specific context-manager analogous to PyTorch's inference_mode to use for inference - workload on Intel CPUs, especially Intel Xeon Scalable CPUs. 
- """, -) -class inference_mode: - __slots__ = ("_model", "_dtype", "_graph_mode", "_verbose", "_original", "_jit") - - def __init__( - self, - model: Union[nn.Module, Pipeline], - dtype: torch.dtype = torch.float32, - **kwargs, - ): - """ - Args: - model (`torch.nn.Module` or `transformers.Pipeline`): - The model or pipeline instance to optimize. - dtype (`torch.dtype = torch.float32`), *optional*): - The data type used to do the computation. - Acceptable type are `torch.float32` (default) and `torch.bfloat16`. - Please note `torch.bfloat16` requires `avx512_bf16` instructions set as present on - 4th Generation of Intel Xeon Scalable CPUs (Sapphire Rapids). - jit (`boolean = False`, *optional*): - Enable jit to accelerate inference speed - """ - logger.warning( - "`inference_mode` is deprecated and will be removed in v1.18.0. Use `pipeline` to load and export your model to TorchScript instead." - ) - - if not is_ipex_available(): - raise ImportError(IPEX_NOT_AVAILABLE_ERROR_MSG) - - self._model = model - self._dtype = dtype - self._graph_mode = False # Let's keep for future use when it doesn't hang anymore - self._original = None - - if "jit" in kwargs: - logger.warning( - "`jit` is deprecated and will be removed in a future version. Use `IPEXModel` to load and export your model to TorchScript instead." - ) - self._jit = kwargs.pop("jit", False) - - def __enter__(self): - if self._model.framework == "pt": - with torch.inference_mode(): - try: - ipex.enable_onednn_fusion(True) - - self._original = self._model.model if isinstance(self._model, Pipeline) else self._model - model = ipex.optimize( - self._original, - dtype=self._dtype, - graph_mode=self._graph_mode, - level="O1", - auto_kernel_selection=True, - ) - if self._jit: - use_cache = getattr(self._original.config, "use_cache", False) - task = ( - self._model.task - if isinstance(self._model, Pipeline) - else TasksManager._infer_task_from_model_or_model_class(model) - ) - if task in _HEAD_TO_AUTOMODELS: - model = jit_trace(model, task, use_cache) - auto_model_class = eval(_HEAD_TO_AUTOMODELS[task]) - model = auto_model_class(model, self._original.config, use_cache=use_cache) - - # Enable automatic mixed precision (AMP) if we are going to target `bfloat16` - with torch.cpu.amp.autocast(enabled=self._dtype == torch.bfloat16): - if isinstance(self._model, Pipeline): - # Patching model with the new one - self._model.model = _ModelFallbackWrapper(model, self._original) - return self._model - return model - - except RuntimeError: - return self._model - else: - return self._model - - def __exit__(self, exc_type, exc_val, exc_tb): - self._model = self._original diff --git a/optimum/intel/utils/dummy_ipex_objects.py b/optimum/intel/utils/dummy_ipex_objects.py index c451dd3956..4bd7eee630 100644 --- a/optimum/intel/utils/dummy_ipex_objects.py +++ b/optimum/intel/utils/dummy_ipex_objects.py @@ -15,13 +15,6 @@ from .import_utils import DummyObject, requires_backends -class inference_mode(metaclass=DummyObject): - _backends = ["ipex"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["ipex"]) - - class IPEXModel(metaclass=DummyObject): _backends = ["ipex"] diff --git a/tests/ipex/test_inference.py b/tests/ipex/test_inference.py deleted file mode 100644 index 1a452fe408..0000000000 --- a/tests/ipex/test_inference.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import torch -from parameterized import parameterized -from transformers import ( - AutoModelForCausalLM, - AutoModelForQuestionAnswering, - AutoModelForSequenceClassification, - AutoModelForTokenClassification, - AutoTokenizer, - pipeline, -) -from utils_tests import MODEL_NAMES - -from optimum.intel import inference_mode as ipex_inference_mode -from optimum.intel.ipex.modeling_base import IPEXModel - - -_CLASSIFICATION_TASK_TO_AUTOMODELS = { - "text-classification": AutoModelForSequenceClassification, - "token-classification": AutoModelForTokenClassification, -} - - -class IPEXClassificationTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bert", - "distilbert", - "roberta", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a sample input" - for task, auto_model_class in _CLASSIFICATION_TASK_TO_AUTOMODELS.items(): - model = auto_model_class.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline(task, model=model, tokenizer=tokenizer) - - with torch.inference_mode(): - outputs = pipe(inputs) - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe(inputs) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs[0]["score"], outputs_ipex[0]["score"]) - - -class IPEXQuestionAnsweringTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bert", - "distilbert", - "roberta", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForQuestionAnswering.from_pretrained(model_id, torch_dtype=torch.float32) - pipe = pipeline("question-answering", model=model, tokenizer=tokenizer) - - with torch.inference_mode(): - outputs = pipe(question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris.") - with ipex_inference_mode(pipe, dtype=model.config.torch_dtype, verbose=False, jit=True) as ipex_pipe: - outputs_ipex = ipex_pipe( - question="Where was HuggingFace founded ?", context="HuggingFace was founded in Paris." 
- ) - self.assertTrue(isinstance(ipex_pipe.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(outputs["start"], outputs_ipex["start"]) - self.assertEqual(outputs["end"], outputs_ipex["end"]) - - -class IPEXTextGenerationTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = ( - "bloom", - "gptj", - "gpt2", - "gpt_neo", - "gpt_bigcode", - "llama", - "llama2", - "opt", - "mpt", - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_pipeline_inference(self, model_arch): - model_id = MODEL_NAMES[model_arch] - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, return_dict=False) - model = model.eval() - tokenizer = AutoTokenizer.from_pretrained(model_id) - inputs = "This is a simple input" - text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) - with torch.inference_mode(): - output = text_generator(inputs) - with ipex_inference_mode( - text_generator, dtype=model.config.torch_dtype, verbose=False, jit=True - ) as ipex_text_generator: - output_ipex = ipex_text_generator(inputs) - self.assertTrue(isinstance(ipex_text_generator.model._optimized, IPEXModel)) - self.assertTrue(isinstance(ipex_text_generator.model._optimized.model, torch.jit.RecursiveScriptModule)) - self.assertEqual(output[0]["generated_text"], output_ipex[0]["generated_text"]) From 2696e6fe4729424bc4f85a508b48c69217e3eb17 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Tue, 3 Sep 2024 13:56:06 +0400 Subject: [PATCH 7/9] Add export config for gemma2 (#876) * add export config for gemma2 * update cache position and tests * update model list * fix without cache export * patch original torch gemma2 to work with dynamic cache * Update tests/openvino/test_modeling.py * prevent usage cache implementation * add min transformers version --- docs/source/openvino/models.mdx | 1 + optimum/exporters/openvino/model_configs.py | 21 ++++++++ optimum/exporters/openvino/model_patcher.py | 60 ++++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 2 + tests/openvino/test_modeling.py | 33 ++++++++++-- tests/openvino/utils_tests.py | 1 + 6 files changed, 112 insertions(+), 6 deletions(-) diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index b82e68fe4e..07da423f8d 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -55,6 +55,7 @@ Here is the list of the supported architectures : - GPT-NeoX - GPT-NeoX-Japanese - Gemma +- Gemma2 - Hubert - IBert - InternLM diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 0ad38927a2..ca62b82de6 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -54,6 +54,7 @@ CodeGenModelPatcher, DBRXModelPatcher, FalconModelPatcher, + Gemma2ModelPatcher, GptNeoxJapaneseModelPatcher, GptNeoxModelPatcher, InternLM2Patcher, @@ -997,3 +998,23 @@ def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs) + + +@register_in_tasks_manager( + "gemma2", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class Gemma2OpenVINOConfig(GemmaOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.43.0") + + def patch_model_for_export( + self, model: Union["PreTrainedModel", 
"TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 8cb745bd72..59d4bedb51 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import inspect import logging as log import math @@ -23,7 +24,7 @@ from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.utils import is_tf_available -from optimum.exporters.onnx.model_patcher import DecoderModelPatcher +from optimum.exporters.onnx.model_patcher import DecoderModelPatcher, override_arguments from optimum.intel.utils.import_utils import ( _openvino_version, _torch_version, @@ -2409,3 +2410,60 @@ def __enter__(self): super().__enter__() for layer in self._model.gpt_neox_japanese.layers: _reinitialize_cos_sin_cached_fp32(layer.attention.rotary_emb) + + +class Gemma2ModelPatcher(LlamaModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + from transformers.cache_utils import DynamicCache + + signature = inspect.signature(self.orig_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + return_legacy_cache = False + pkv_in_args = False + legacy_pkv = None + if "past_key_values" in kwargs: + legacy_pkv = kwargs.pop("past_key_values", None) + sign_names = list(signature.parameters.keys()) + pkv_argument_index = sign_names.index("past_key_values") + cache_position_index = sign_names.index("cache_position") if "cache_position" in sign_names else -1 + input_ids_index = sign_names.index("input_ids" if "input_ids" in sign_names else "inputs_embeds") + if legacy_pkv is None and len(args) > pkv_argument_index: + legacy_pkv = args[pkv_argument_index] + pkv_in_args = True + if legacy_pkv is not None: + pkv = DynamicCache.from_legacy_cache(legacy_pkv) + return_legacy_cache = True + if not pkv_in_args: + kwargs["past_key_values"] = pkv + else: + args[pkv_argument_index] = pkv + + if ( + return_legacy_cache + and cache_position_index != -1 + and (cache_position_index > len(args) and "cache_position" not in kwargs) + ): + past_seen_tokens = legacy_pkv[0][0].shape[-2] + input_ids = args[input_ids_index] + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + input_ids.shape[1], device=input_ids.device + ) + kwargs["cache_position"] = cache_position + + outputs = self.orig_forward(*args, **kwargs) + if return_legacy_cache: + outputs.past_key_values = outputs.past_key_values.to_legacy_cache() + + return outputs + + self.patched_forward = patched_forward diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 554fdee7cd..de094508c9 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -806,6 +806,8 @@ def _from_pretrained( force_download=force_download, local_files_only=local_files_only, ) + if getattr(generation_config, "cache_implementation", None) is not None: + generation_config.cache_implementation = None 
kwargs["generation_config"] = generation_config except Exception: pass diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index d71bbea453..6f24ea0de5 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -57,6 +57,7 @@ from transformers.testing_utils import slow from utils_tests import MODEL_NAMES +from optimum.exporters.openvino.model_patcher import patch_update_causal_mask from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -647,6 +648,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.40.0"): SUPPORTED_ARCHITECTURES += ( "gemma", + "gemma2", "olmo", "stablelm", "starcoder2", @@ -728,7 +730,8 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(torch.allclose(ov_outputs.logits, transformers_outputs.logits, equal_nan=True, atol=1e-4)) # Qwen tokenizer does not support padding - if model_arch == "qwen": + + if model_arch in ["qwen"]: return if model_arch not in ["chatglm", "glm4", "persimmon"]: @@ -753,7 +756,16 @@ def test_compare_to_transformers(self, model_arch): ) ov_outputs = ov_model.generate(**tokens, generation_config=gen_config) - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + additional_inputs = {} + # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, + # align cache representation in torch model + if model_arch == "gemma2": + patch_update_causal_mask(transformers_model, "4.43.0") + transformers_model._supports_cache_class = True + from transformers.cache_utils import DynamicCache + + additional_inputs = {"past_key_values": DynamicCache()} + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config, **additional_inputs) self.assertTrue(torch.allclose(ov_outputs, transformers_outputs)) del transformers_model @@ -921,8 +933,8 @@ def test_beam_search(self, model_arch): "config": AutoConfig.from_pretrained(model_id, trust_remote_code=True), "trust_remote_code": True, } - # Qwen tokenizer does not support padding, chatgm testing model produces nan that incompatible with beam search - if model_arch in ["qwen", "chatglm"]: + # Qwen tokenizer does not support padding, chatglm, glm4 testing models produce nan that incompatible with beam search + if model_arch in ["qwen", "chatglm", "glm4"]: return tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS) @@ -988,6 +1000,12 @@ def test_beam_search(self, model_arch): if model_arch == "arctic": transformers_model.to(torch.float32) + additional_inputs = {} + # gemma2 does not support dynamic cache, it is unfair to compare dynamic cache result vs hybrid cache, align cache representation in torch model + if model_arch == "gemma2": + patch_update_causal_mask(transformers_model, "4.43.0") + transformers_model._supports_cache_class = True + from transformers.cache_utils import DynamicCache tokenizer.pad_token_id = tokenizer.eos_token_id tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) tokens.pop("token_type_ids", None) @@ -1002,7 +1020,12 @@ def test_beam_search(self, model_arch): if gen_config.do_sample and model_arch in ["baichuan2-13b", "olmo"]: continue set_seed(SEED) - transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + + if model_arch == "gemma2": + additional_inputs = {"past_key_values": 
DynamicCache()} + transformers_outputs = transformers_model.generate( + **tokens, generation_config=gen_config, **additional_inputs + ) set_seed(SEED) ov_stateful_outputs = ov_model_stateful.generate(**tokens, generation_config=gen_config) self.assertTrue( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 86a0a51e80..869d5897e6 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -54,6 +54,7 @@ "electra": "hf-internal-testing/tiny-random-electra", "exaone": "katuni4ka/tiny-random-exaone", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", + "gemma2": "katuni4ka/tiny-random-gemma2", "falcon": "fxmarty/really-tiny-falcon-testing", "falcon-40b": "katuni4ka/tiny-random-falcon-40b", "flaubert": "hf-internal-testing/tiny-random-flaubert", From b51ca3f66b3f7322749884288b69187adba007fd Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 3 Sep 2024 12:59:52 +0200 Subject: [PATCH 8/9] Transformers 4.44 support (#877) * test * fix ipex bloom * fix bloom * style * fix * use bloom specific modeling when export version is lower than 4.44 * fix --- .github/workflows/test_ipex.yml | 2 +- .github/workflows/test_openvino.yml | 2 +- .github/workflows/test_openvino_basic.yml | 2 +- optimum/exporters/ipex/model_patcher.py | 2 +- optimum/exporters/openvino/stateful.py | 6 +++-- optimum/intel/ipex/modeling_base.py | 2 +- optimum/intel/openvino/modeling_base.py | 2 +- optimum/intel/openvino/modeling_decoder.py | 27 +++++++++++++++------- setup.py | 6 ++--- 9 files changed, 32 insertions(+), 19 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 8b97bdd535..8cdfe30b58 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: python-version: [3.9] - transformers-version: ["4.39.0", "4.43.*"] + transformers-version: ["4.39.0", "4.44.*"] ipex-version: ["2.2.0", "2.3.*"] include: - python-version: 3.8 diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 2262407898..335acf669b 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.43.*"] + transformers-version: ["4.36.0", "4.44.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_openvino_basic.yml b/.github/workflows/test_openvino_basic.yml index 28c8369c75..ced98dd9ab 100644 --- a/.github/workflows/test_openvino_basic.yml +++ b/.github/workflows/test_openvino_basic.yml @@ -24,7 +24,7 @@ jobs: # This also ensures that the test fails if dependencies break for Python 3.7 python-version: ["3.8", "3.12"] os: ["ubuntu-22.04", "windows-latest"] - transformers-version: ["4.43.*"] + transformers-version: ["4.44.*"] include: - python-version: "3.12" os: "ubuntu-22.04" diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 216c1c3918..484fd38077 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -40,7 +40,7 @@ # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.43.99" +_TRANSFORMERS_MAX_VERSION = "4.44.99" _IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",) diff --git 
a/optimum/exporters/openvino/stateful.py b/optimum/exporters/openvino/stateful.py index d8132107a1..a0c82108f1 100644 --- a/optimum/exporters/openvino/stateful.py +++ b/optimum/exporters/openvino/stateful.py @@ -21,7 +21,7 @@ import openvino as ov from openvino.runtime import opset13 from optimum.exporters import TasksManager -from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version +from optimum.intel.utils.import_utils import _openvino_version, is_openvino_version, is_transformers_version def model_has_state(ov_model: ov.Model): @@ -216,7 +216,9 @@ def patch_stateful(config: PretrainedConfig, ov_model: ov.Model): batch_dim = 1 if config.model_type == "chatglm" and not hasattr(config, "rope_ratio") else 0 fuse_cache_reorder(ov_model, not_kv_inputs, key_value_input_names, batch_dim) - num_attention_heads = config.num_attention_heads if config.model_type == "bloom" else 1 + num_attention_heads = ( + config.num_attention_heads if (config.model_type == "bloom" and is_transformers_version("<", "4.44")) else 1 + ) make_stateful( ov_model, not_kv_inputs, key_value_input_names, key_value_output_names, batch_dim, num_attention_heads, None ) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 568e5be629..d6467f76a2 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -564,7 +564,7 @@ def _prepare_past_key_values(self, input_ids): ] ) return past_key_values - elif model_type == "bloom": + elif model_type == "bloom" and is_transformers_version("<", "4.44"): shape_key = (batch_size * num_attention_heads, d_k, 0) shape_value = (batch_size * num_attention_heads, 0, d_k) key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 90c43b7805..98fec17351 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -140,7 +140,7 @@ def dtype(self) -> Optional[torch.dtype]: def load_model( file_name: Union[str, Path], quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, - ): + ) -> openvino.runtime.Model: """ Loads the model. 
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index de094508c9..23117e9361 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -25,6 +25,7 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from openvino.preprocess import PrePostProcessor from openvino.runtime import Core, Tensor, Type +from packaging.version import Version from transformers import AutoModelForCausalLM, PretrainedConfig from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward from transformers.generation import GenerationMixin @@ -38,7 +39,7 @@ from ...exporters.openvino import ensure_stateful_is_available, main_export, patch_stateful from ...exporters.openvino.stateful import model_has_state -from ..utils.import_utils import is_nncf_available, is_transformers_version +from ..utils.import_utils import compare_versions, is_nncf_available, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .configuration import ( OVConfig, @@ -51,8 +52,8 @@ if TYPE_CHECKING: + from transformers.generation.streamers import BaseStreamer from transformers.modeling_utils import PreTrainedModel - from transformers.streamers import BaseStreamer logger = logging.getLogger(__name__) @@ -404,7 +405,10 @@ def prepare_inputs( **kwargs, ) -> Dict: batch_size = input_ids.shape[0] - if self.config.model_type == "bloom": + model_transformers_version = Version( + self.model.rt_info["optimum"]["transformers_version"].value if "optimum" in self.model.rt_info else "0.0.0" + ) + if self.config.model_type == "bloom" and compare_versions(model_transformers_version, "<", "4.44"): batch_size *= self.config.num_attention_heads inputs = {} @@ -619,7 +623,10 @@ def _deduplicate_inputs(self, model_inputs: Dict): shape = input_tensor.shape if isinstance(input_tensor, Tensor) else list(input_tensor.shape) dtype = input_tensor.element_type if isinstance(input_tensor, Tensor) else Type(input_tensor.dtype) upd_batch_size = indicies.shape[0] - if self.config.model_type == "bloom": + export_transformers_version = Version(self.model.rt_info["optimum"]["transformers_version"].value) + if self.config.model_type == "bloom" and compare_versions( + export_transformers_version, "<", "4.44" + ): upd_batch_size *= self.config.num_attention_heads shape[ ( @@ -631,10 +638,11 @@ def _deduplicate_inputs(self, model_inputs: Dict): upd_model_inputs[input_name] = Tensor(dtype, shape) upd_model_inputs["input_ids"] = unique_input_ids if "beam_idx" in model_inputs: + export_transformers_version = Version(self.model.rt_info["optimum"]["transformers_version"].value) beam_range = ( - unique_input_ids.shape[0] - if self.config.model_type != "bloom" - else unique_input_ids.shape[0] * self.config.num_attention_heads + unique_input_ids.shape[0] * self.config.num_attention_heads + if (self.config.model_type == "bloom" and compare_versions(export_transformers_version, "<", "4.44")) + else unique_input_ids.shape[0] ) beam_idx = np.arange(beam_range, dtype=int) upd_model_inputs["beam_idx"] = beam_idx @@ -781,7 +789,10 @@ def _from_pretrained( model = cls.load_model(model_cache_path) model_type = config.model_type.replace("_", "-") - if model_type == "bloom": + export_transformers_version = Version( + model.rt_info["optimum"]["transformers_version"].value if "optimum" in model.rt_info else "0.0.0" + ) + if model_type == "bloom" and compare_versions(export_transformers_version, "<", "4.44"): init_cls = 
OVBloomForCausalLM elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM diff --git a/setup.py b/setup.py index e637f49e18..cd488f8301 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "transformers>=4.36.0,<4.44.0", + "transformers>=4.36,<4.45", "optimum@git+https://github.com/huggingface/optimum.git", "datasets>=1.4.0", "sentencepiece", @@ -59,10 +59,10 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43.0"], + "neural-compressor": ["neural-compressor>=2.2.0,<3.0", "accelerate", "transformers<4.43"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39,<4.45"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 40194a016723acc99b73310cc5320c346f15f691 Mon Sep 17 00:00:00 2001 From: Nikita Savelyev Date: Wed, 4 Sep 2024 10:01:45 +0200 Subject: [PATCH 9/9] Deprecate export parameters (#886) --- optimum/exporters/openvino/__main__.py | 38 -------------------------- tests/openvino/test_export.py | 3 -- tests/openvino/test_exporters_cli.py | 6 +--- 3 files changed, 1 insertion(+), 46 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index c4b6ef0cd8..842198625d 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -49,15 +49,6 @@ import torch -_COMPRESSION_OPTIONS = { - "int8": {"bits": 8}, - "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128}, - "int4_asym_g128": {"bits": 4, "sym": False, "group_size": 128}, - "int4_sym_g64": {"bits": 4, "sym": True, "group_size": 64}, - "int4_asym_g64": {"bits": 4, "sym": False, "group_size": 64}, -} - - logger = logging.getLogger(__name__) @@ -108,8 +99,6 @@ def main_export( model_kwargs: Optional[Dict[str, Any]] = None, custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - compression_option: Optional[str] = None, - compression_ratio: Optional[float] = None, ov_config: "OVConfig" = None, stateful: bool = True, convert_tokenizer: bool = False, @@ -171,11 +160,6 @@ def main_export( fn_get_submodels (`Optional[Callable]`, defaults to `None`): Experimental usage: Override the default submodels that are used at the export. This is especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. - compression_option (`Optional[str]`, defaults to `None`): - The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, - `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression. - compression_ratio (`Optional[float]`, defaults to `None`): - Compression ratio between primary and backup precision (only relevant to INT4). stateful (`bool`, defaults to `True`): Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. 
Applicable only for decoder models. **kwargs_shapes (`Dict`): @@ -198,28 +182,6 @@ def main_export( raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") token = use_auth_token - if compression_option is not None: - logger.warning( - "The `compression_option` argument is deprecated and will be removed in optimum-intel v1.17.0. " - "Please, pass an `ov_config` argument instead `OVConfig(..., quantization_config=quantization_config)`." - ) - - if compression_ratio is not None: - logger.warning( - "The `compression_ratio` argument is deprecated and will be removed in optimum-intel v1.17.0. " - "Please, pass an `ov_config` argument instead `OVConfig(quantization_config={ratio=compression_ratio})`." - ) - - if ov_config is None and compression_option is not None: - from ...intel.openvino.configuration import OVConfig - - if compression_option == "fp16": - ov_config = OVConfig(dtype="fp16") - elif compression_option != "fp32": - q_config = _COMPRESSION_OPTIONS[compression_option] if compression_option in _COMPRESSION_OPTIONS else {} - q_config["ratio"] = compression_ratio or 1.0 - ov_config = OVConfig(quantization_config=q_config) - original_task = task task = infer_task( task, model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token diff --git a/tests/openvino/test_export.py b/tests/openvino/test_export.py index ef20ed5a2d..d48e86fe27 100644 --- a/tests/openvino/test_export.py +++ b/tests/openvino/test_export.py @@ -16,7 +16,6 @@ import unittest from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional import torch from parameterized import parameterized @@ -76,7 +75,6 @@ class ExportModelTest(unittest.TestCase): def _openvino_export( self, model_type: str, - compression_option: Optional[str] = None, stateful: bool = True, patch_16bit_model: bool = False, ): @@ -106,7 +104,6 @@ def _openvino_export( output=Path(tmpdirname), task=supported_task, preprocessors=preprocessors, - compression_option=compression_option, stateful=stateful, ) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 6380a52881..9da496ae05 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -108,16 +108,12 @@ class OVCLIExportTestCase(unittest.TestCase): ), ] - def _openvino_export( - self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None - ): + def _openvino_export(self, model_name: str, task: str): with TemporaryDirectory() as tmpdir: main_export( model_name_or_path=model_name, output=tmpdir, task=task, - compression_option=compression_option, - compression_ratio=compression_ratio, ) @parameterized.expand(SUPPORTED_ARCHITECTURES)
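# A hedged sketch of the replacement for the removed `compression_option` / `compression_ratio`
# arguments, based on the mapping that previously lived in _COMPRESSION_OPTIONS: what used to be
# compression_option="int4_sym_g128", compression_ratio=0.8 becomes an explicit OVConfig carrying
# a weight-quantization dict. The model id and output path are placeholders.
from optimum.exporters.openvino import main_export
from optimum.intel.openvino.configuration import OVConfig

ov_config = OVConfig(quantization_config={"bits": 4, "sym": True, "group_size": 128, "ratio": 0.8})
main_export(
    model_name_or_path="gpt2",       # placeholder
    output="exported_ov_model",      # placeholder
    task="text-generation-with-past",
    ov_config=ov_config,
)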