Commit

Merge branch 'main' into wangchang/inc_woq
changwangss authored Sep 5, 2024
2 parents 6eba7c4 + 40194a0 commit 2683608
Showing 31 changed files with 509 additions and 569 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_ipex.yml
@@ -22,7 +22,7 @@ jobs:
fail-fast: false
matrix:
python-version: [3.9]
transformers-version: ["4.39.0", "4.43.*"]
transformers-version: ["4.39.0", "4.44.*"]
ipex-version: ["2.2.0", "2.3.*"]
include:
- python-version: 3.8
5 changes: 2 additions & 3 deletions .github/workflows/test_openvino.yml
@@ -21,7 +21,7 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.8", "3.12"]
transformers-version: ["4.36.0", "4.43.*"]
transformers-version: ["4.36.0", "4.44.*"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
@@ -51,7 +51,6 @@ jobs:
pytest tests/openvino/test_modeling_basic.py
- name: Test openvino-nightly
run: |
- pip uninstall -y openvino
- pip install openvino-nightly
+ pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
2 changes: 1 addition & 1 deletion .github/workflows/test_openvino_basic.yml
@@ -24,7 +24,7 @@ jobs:
# This also ensures that the test fails if dependencies break for Python 3.7
python-version: ["3.8", "3.12"]
os: ["ubuntu-22.04", "windows-latest"]
transformers-version: ["4.43.*"]
transformers-version: ["4.44.*"]
include:
- python-version: "3.12"
os: "ubuntu-22.04"
9 changes: 5 additions & 4 deletions docker/Dockerfile.intel
@@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
libpng-dev \
python3 \
python3-pip \
+ python3-dev \
+ libnuma-dev \
&& rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
@@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} \
-f https://download.pytorch.org/whl/torch_stable.html && \
python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
- python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+ python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+ python3 -m pip install --no-cache-dir numa

ARG OMP_NUM_THREADS=1
ENV OMP_NUM_THREADS=${OMP_NUM_THREADS}
ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
- ARG KMP_HW_SUBSET=1T
- ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
- ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -55,6 +55,7 @@ Here is the list of the supported architectures :
- GPT-NeoX
- GPT-NeoX-Japanese
- Gemma
+ - Gemma2
- Hubert
- IBert
- InternLM
19 changes: 6 additions & 13 deletions optimum/commands/export/openvino.py
@@ -70,9 +70,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--weight-format",
type=str,
choices=["fp32", "fp16", "int8", "int4", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"],
choices=["fp32", "fp16", "int8", "int4", "mxfp4"],
default=None,
help="he weight format of the exported model.",
help="The weight format of the exported model.",
)
optional_group.add_argument(
"--library",
@@ -255,12 +255,11 @@ def run(self):
elif self.args.weight_format in {"fp16", "fp32"}:
ov_config = OVConfig(dtype=self.args.weight_format)
else:
- is_int8 = self.args.weight_format == "int8"
-
- # For int4 quantization if no parameter is provided, then use the default config if exist
- if no_compression_parameter_provided(self.args) and not is_int8:
+ # For int4 quantization if no parameter is provided, then use the default config if exists
+ if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4":
quantization_config = get_default_int4_config(self.args.model)
else:
+ is_int8 = self.args.weight_format == "int8"
quantization_config = {
"bits": 8 if is_int8 else 4,
"ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
@@ -272,17 +271,11 @@
"quant_method": "awq" if self.args.awq else "default",
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
"weight_format": self.args.weight_format,
}

if quantization_config.get("dataset", None) is not None:
quantization_config["trust_remote_code"] = self.args.trust_remote_code

- if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
- logger.warning(
- f"--weight-format {self.args.weight_format} is deprecated, possible choices are fp32, fp16, int8, int4"
- )
- quantization_config["sym"] = "asym" not in self.args.weight_format
- quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
ov_config = OVConfig(quantization_config=quantization_config)

quantization_config = ov_config.quantization_config if ov_config else None
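Note on the hunk above: when `--weight-format int8` or `int4` is passed without extra compression flags, the CLI now assembles a weight-compression dict and wraps it in an `OVConfig`, carrying the new `"weight_format"` key through. The sketch below only illustrates the shape of that dict; the values are placeholders, not project defaults, and the `optimum.intel` import path is an assumption rather than the internal path the CLI uses.

```python
# Rough sketch of the weight-compression config assembled by the CLI above.
# Values are illustrative placeholders; the real defaults come from _DEFAULT_4BIT_CONFIG.
from optimum.intel import OVConfig  # assumed public import path

quantization_config = {
    "bits": 4,                   # 8 when --weight-format int8 is used
    "ratio": 0.8,                # placeholder ratio between 4-bit and backup precision
    "sym": False,                # placeholder symmetry setting
    "group_size": 128,           # placeholder group size
    "quant_method": "default",   # "awq" when --awq is passed
    "weight_format": "int4",     # newly carried through from --weight-format in this commit
}
ov_config = OVConfig(quantization_config=quantization_config)
```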
2 changes: 1 addition & 1 deletion optimum/exporters/ipex/model_patcher.py
@@ -40,7 +40,7 @@

# Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
_TRANSFORMERS_MIN_VERSION = "4.39.0"
_TRANSFORMERS_MAX_VERSION = "4.43.99"
_TRANSFORMERS_MAX_VERSION = "4.44.99"

_IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)

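The two constants above pin the range of transformers releases accepted by the IPEX export path. A minimal, illustrative version gate of that kind (not code from this repository) could look like:

```python
# Illustrative sketch of a min/max transformers version check using the constants above;
# a plain packaging-based comparison, not the repository's actual guard.
import transformers
from packaging import version

_TRANSFORMERS_MIN_VERSION = "4.39.0"
_TRANSFORMERS_MAX_VERSION = "4.44.99"

current = version.parse(transformers.__version__)
if not version.parse(_TRANSFORMERS_MIN_VERSION) <= current <= version.parse(_TRANSFORMERS_MAX_VERSION):
    raise ImportError(
        f"transformers=={transformers.__version__} is outside the supported range "
        f"[{_TRANSFORMERS_MIN_VERSION}, {_TRANSFORMERS_MAX_VERSION}] for the IPEX export."
    )
```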
88 changes: 48 additions & 40 deletions optimum/exporters/openvino/__main__.py
@@ -14,7 +14,9 @@

import gc
import logging
+ import operator
import warnings
+ from functools import reduce
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

@@ -23,18 +25,20 @@
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers.utils import is_torch_available

+ from openvino.runtime import Core, Type, save_model
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
from optimum.exporters.openvino.convert import export_from_model
from optimum.intel.utils.import_utils import (
+ is_nncf_available,
is_openvino_tokenizers_available,
is_openvino_version,
is_transformers_version,
)
from optimum.utils.save_utils import maybe_load_preprocessors

- from .utils import clear_class_registry
+ from .utils import _MAX_UNCOMPRESSED_SIZE, clear_class_registry


if TYPE_CHECKING:
@@ -45,15 +49,6 @@
import torch


- _COMPRESSION_OPTIONS = {
- "int8": {"bits": 8},
- "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128},
- "int4_asym_g128": {"bits": 4, "sym": False, "group_size": 128},
- "int4_sym_g64": {"bits": 4, "sym": True, "group_size": 64},
- "int4_asym_g64": {"bits": 4, "sym": False, "group_size": 64},
- }


logger = logging.getLogger(__name__)


@@ -104,8 +99,6 @@ def main_export(
model_kwargs: Optional[Dict[str, Any]] = None,
custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None,
fn_get_submodels: Optional[Callable] = None,
- compression_option: Optional[str] = None,
- compression_ratio: Optional[float] = None,
ov_config: "OVConfig" = None,
stateful: bool = True,
convert_tokenizer: bool = False,
@@ -167,11 +160,6 @@
fn_get_submodels (`Optional[Callable]`, defaults to `None`):
Experimental usage: Override the default submodels that are used at the export. This is
especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success.
- compression_option (`Optional[str]`, defaults to `None`):
- The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
- `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression.
- compression_ratio (`Optional[float]`, defaults to `None`):
- Compression ratio between primary and backup precision (only relevant to INT4).
stateful (`bool`, defaults to `True`):
Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
**kwargs_shapes (`Dict`):
@@ -194,28 +182,6 @@
raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
token = use_auth_token

- if compression_option is not None:
- logger.warning(
- "The `compression_option` argument is deprecated and will be removed in optimum-intel v1.17.0. "
- "Please, pass an `ov_config` argument instead `OVConfig(..., quantization_config=quantization_config)`."
- )
-
- if compression_ratio is not None:
- logger.warning(
- "The `compression_ratio` argument is deprecated and will be removed in optimum-intel v1.17.0. "
- "Please, pass an `ov_config` argument instead `OVConfig(quantization_config={ratio=compression_ratio})`."
- )
-
- if ov_config is None and compression_option is not None:
- from ...intel.openvino.configuration import OVConfig
-
- if compression_option == "fp16":
- ov_config = OVConfig(dtype="fp16")
- elif compression_option != "fp32":
- q_config = _COMPRESSION_OPTIONS[compression_option] if compression_option in _COMPRESSION_OPTIONS else {}
- q_config["ratio"] = compression_ratio or 1.0
- ov_config = OVConfig(quantization_config=q_config)
-
original_task = task
task = infer_task(
task, model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
@@ -402,7 +368,7 @@ class StoreAttr(object):
model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
)

- export_from_model(
+ submodel_paths = export_from_model(
model=model,
output=output,
task=task,
@@ -425,6 +391,48 @@ class StoreAttr(object):
del model
gc.collect()

+ core = Core()
+ for submodel_path in submodel_paths:
+ submodel_path = Path(output) / submodel_path
+ submodel = core.read_model(submodel_path)
+
+ quantization_config = None
+ if ov_config is None:
+ num_parameters = 0
+ for op in submodel.get_ops():
+ if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+ num_parameters += reduce(operator.mul, op.shape, 1)
+ if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+ if is_nncf_available():
+ quantization_config = {"bits": 8, "sym": False}
+ logger.info("The model weights will be quantized to int8_asym.")
+ else:
+ logger.warning(
+ "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+ "requires nncf. Please install it with `pip install nncf`"
+ )
+ break
+ else:
+ quantization_config = ov_config.quantization_config
+ if quantization_config is None:
+ continue
+
+ if not is_nncf_available():
+ raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+
+ from optimum.intel.openvino.quantization import _weight_only_quantization
+
+ _weight_only_quantization(submodel, quantization_config)
+
+ compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+ save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+ del submodel
+
+ submodel_path.unlink()
+ submodel_path.with_suffix(".bin").unlink()
+ compressed_submodel_path.rename(submodel_path)
+ compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))

# Unpatch modules after GPTQ export
if do_gptq_patching:
torch.cuda.is_available = orig_cuda_check
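The loop added above re-opens each exported OpenVINO IR, counts floating-point weight elements in its Constant nodes, and only then decides whether default 8-bit weight compression should kick in. A standalone sketch of just that counting step, assuming an already-exported "model.xml" on disk; the threshold literal below is a stand-in, since the value of `_MAX_UNCOMPRESSED_SIZE` is not shown in this diff:

```python
# Standalone sketch of the weight-counting heuristic used above, run on a local "model.xml".
# The 1e9 threshold is only a placeholder for _MAX_UNCOMPRESSED_SIZE.
import operator
from functools import reduce

from openvino.runtime import Core, Type

core = Core()
submodel = core.read_model("model.xml")

num_parameters = 0
for op in submodel.get_ops():
    # Only floating-point Constant nodes (the weights) count toward the threshold.
    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
        num_parameters += reduce(operator.mul, op.shape, 1)

print(f"{num_parameters:,} floating-point weight elements")
if num_parameters >= 1_000_000_000:  # stand-in threshold
    print("Large model: int8 weight compression would be applied by default.")
```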
38 changes: 3 additions & 35 deletions optimum/exporters/openvino/convert.py
@@ -49,7 +49,6 @@
from .model_patcher import patch_model_with_bettertransformer
from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful
from .utils import (
- _MAX_UNCOMPRESSED_SIZE,
OV_XML_FILE_NAME,
clear_class_registry,
flattenize_inputs,
@@ -76,21 +75,7 @@


def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None):
- compress_to_fp16 = False
-
- if ov_config is not None:
- if ov_config.quantization_config:
- if not is_nncf_available():
- raise ImportError(
- "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
- )
-
- from optimum.intel.openvino.quantization import _weight_only_quantization
-
- _weight_only_quantization(model, ov_config.quantization_config)
-
- compress_to_fp16 = ov_config.dtype == "fp16"
-
+ compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
model = _add_version_info_to_model(model, library_name)
save_model(model, path, compress_to_fp16)

@@ -643,25 +628,6 @@ def export_from_model(
)
logging.disable(logging.NOTSET)

- if ov_config is None:
- if library_name == "diffusers":
- num_parameters = model.unet.num_parameters()
- else:
- num_parameters = sum(param.numel() for param in list(model.parameters()) if param.requires_grad)
-
- if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
- if is_nncf_available():
- from ...intel.openvino.configuration import OVConfig
-
- ov_config = OVConfig(quantization_config={"bits": 8, "sym": False})
-
- logger.info("The model weights will be quantized to int8_asym.")
- else:
- logger.warning(
- "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
- "please install it with `pip install nncf`"
- )
-
if library_name != "diffusers":
# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
@@ -720,6 +686,8 @@ def export_from_model(
patch_16bit_model=patch_16bit_model,
)

+ return files_subpaths


def export_tokenizer(
tokenizer,
21 changes: 21 additions & 0 deletions optimum/exporters/openvino/model_configs.py
@@ -54,6 +54,7 @@
CodeGenModelPatcher,
DBRXModelPatcher,
FalconModelPatcher,
+ Gemma2ModelPatcher,
GptNeoxJapaneseModelPatcher,
GptNeoxModelPatcher,
InternLM2Patcher,
@@ -997,3 +998,23 @@ def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)


+ @register_in_tasks_manager(
+ "gemma2",
+ *[
+ "feature-extraction",
+ "feature-extraction-with-past",
+ "text-generation",
+ "text-generation-with-past",
+ "text-classification",
+ ],
+ library_name="transformers",
+ )
+ class Gemma2OpenVINOConfig(GemmaOnnxConfig):
+ MIN_TRANSFORMERS_VERSION = version.parse("4.43.0")
+
+ def patch_model_for_export(
+ self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+ ) -> "ModelPatcher":
+ return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs)
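With the registration above, Gemma2 checkpoints export like any other supported architecture (given transformers >= 4.43). A minimal usage sketch; the model id and output directory are placeholders, not taken from this commit:

```python
# Illustrative use of the Gemma2 export path enabled above; model id and paths are placeholders.
from optimum.intel import OVModelForCausalLM

model = OVModelForCausalLM.from_pretrained("google/gemma-2-2b", export=True, compile=False)
model.save_pretrained("gemma2-openvino")
```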