Commit

Merge branch 'main' into wangchang/inc_woq
changwangss authored Sep 5, 2024
2 parents 6eba7c4 + 40194a0 commit 2683608
Showing 31 changed files with 509 additions and 569 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_ipex.yml
@@ -22,7 +22,7 @@ jobs:
fail-fast: false
matrix:
python-version: [3.9]
transformers-version: ["4.39.0", "4.43.*"]
transformers-version: ["4.39.0", "4.44.*"]
ipex-version: ["2.2.0", "2.3.*"]
include:
- python-version: 3.8
5 changes: 2 additions & 3 deletions .github/workflows/test_openvino.yml
@@ -21,7 +21,7 @@ jobs:
fail-fast: false
matrix:
python-version: ["3.8", "3.12"]
transformers-version: ["4.36.0", "4.43.*"]
transformers-version: ["4.36.0", "4.44.*"]
os: [ubuntu-latest]

runs-on: ${{ matrix.os }}
@@ -51,7 +51,6 @@ jobs:
pytest tests/openvino/test_modeling_basic.py
- name: Test openvino-nightly
run: |
- pip uninstall -y openvino
- pip install openvino-nightly
+ pip install -U --pre openvino openvino-tokenizers --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)"
optimum-cli export openvino -m hf-internal-testing/tiny-random-gpt2 gpt2-ov
2 changes: 1 addition & 1 deletion .github/workflows/test_openvino_basic.yml
@@ -24,7 +24,7 @@ jobs:
# This also ensures that the test fails if dependencies break for Python 3.7
python-version: ["3.8", "3.12"]
os: ["ubuntu-22.04", "windows-latest"]
transformers-version: ["4.43.*"]
transformers-version: ["4.44.*"]
include:
- python-version: "3.12"
os: "ubuntu-22.04"
9 changes: 5 additions & 4 deletions docker/Dockerfile.intel
@@ -27,6 +27,8 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
libpng-dev \
python3 \
python3-pip \
+ python3-dev \
+ libnuma-dev \
&& rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache
@@ -43,12 +45,11 @@ RUN python3 -m pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} \
-f https://download.pytorch.org/whl/torch_stable.html && \
python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
- python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+ python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \
+ python3 -m pip install --no-cache-dir numa

ARG OMP_NUM_THREADS=1
ENV OMP_NUM_THREADS=${OMP_NUM_THREADS}
ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
- ARG KMP_HW_SUBSET=1T
- ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
- ENV LD_PRELOAD="/usr/local/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so"
+ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"
1 change: 1 addition & 0 deletions docs/source/openvino/models.mdx
@@ -55,6 +55,7 @@ Here is the list of the supported architectures :
- GPT-NeoX
- GPT-NeoX-Japanese
- Gemma
+ - Gemma2
- Hubert
- IBert
- InternLM
19 changes: 6 additions & 13 deletions optimum/commands/export/openvino.py
@@ -70,9 +70,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
optional_group.add_argument(
"--weight-format",
type=str,
choices=["fp32", "fp16", "int8", "int4", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"],
choices=["fp32", "fp16", "int8", "int4", "mxfp4"],
default=None,
help="he weight format of the exported model.",
help="The weight format of the exported model.",
)
optional_group.add_argument(
"--library",
@@ -255,12 +255,11 @@ def run(self):
elif self.args.weight_format in {"fp16", "fp32"}:
ov_config = OVConfig(dtype=self.args.weight_format)
else:
- is_int8 = self.args.weight_format == "int8"
-
- # For int4 quantization if no parameter is provided, then use the default config if exist
- if no_compression_parameter_provided(self.args) and not is_int8:
+ # For int4 quantization if no parameter is provided, then use the default config if exists
+ if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4":
quantization_config = get_default_int4_config(self.args.model)
else:
+ is_int8 = self.args.weight_format == "int8"
quantization_config = {
"bits": 8 if is_int8 else 4,
"ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]),
@@ -272,17 +271,11 @@
"quant_method": "awq" if self.args.awq else "default",
"sensitivity_metric": self.args.sensitivity_metric,
"scale_estimation": self.args.scale_estimation,
"weight_format": self.args.weight_format,
}

if quantization_config.get("dataset", None) is not None:
quantization_config["trust_remote_code"] = self.args.trust_remote_code

- if self.args.weight_format in {"int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"}:
- logger.warning(
- f"--weight-format {self.args.weight_format} is deprecated, possible choices are fp32, fp16, int8, int4"
- )
- quantization_config["sym"] = "asym" not in self.args.weight_format
- quantization_config["group_size"] = 128 if "128" in self.args.weight_format else 64
ov_config = OVConfig(quantization_config=quantization_config)

quantization_config = ov_config.quantization_config if ov_config else None
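Note on the hunk above: when `--weight-format int8` or `int4` is passed without extra compression flags, the CLI now assembles a weight-compression dict and wraps it in an `OVConfig`, carrying the new `"weight_format"` key through. The sketch below only illustrates the shape of that dict; the values are placeholders, not project defaults, and the `optimum.intel` import path is an assumption rather than the internal path the CLI uses.

```python
# Rough sketch of the weight-compression config assembled by the CLI above.
# Values are illustrative placeholders; the real defaults come from _DEFAULT_4BIT_CONFIG.
from optimum.intel import OVConfig  # assumed public import path

quantization_config = {
    "bits": 4,                   # 8 when --weight-format int8 is used
    "ratio": 0.8,                # placeholder ratio between 4-bit and backup precision
    "sym": False,                # placeholder symmetry setting
    "group_size": 128,           # placeholder group size
    "quant_method": "default",   # "awq" when --awq is passed
    "weight_format": "int4",     # newly carried through from --weight-format in this commit
}
ov_config = OVConfig(quantization_config=quantization_config)
```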
2 changes: 1 addition & 1 deletion optimum/exporters/ipex/model_patcher.py
@@ -40,7 +40,7 @@

# Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version
_TRANSFORMERS_MIN_VERSION = "4.39.0"
_TRANSFORMERS_MAX_VERSION = "4.43.99"
_TRANSFORMERS_MAX_VERSION = "4.44.99"

_IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)

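The two constants above pin the range of transformers releases accepted by the IPEX export path. A minimal, illustrative version gate of that kind (not code from this repository) could look like:

```python
# Illustrative sketch of a min/max transformers version check using the constants above;
# a plain packaging-based comparison, not the repository's actual guard.
import transformers
from packaging import version

_TRANSFORMERS_MIN_VERSION = "4.39.0"
_TRANSFORMERS_MAX_VERSION = "4.44.99"

current = version.parse(transformers.__version__)
if not version.parse(_TRANSFORMERS_MIN_VERSION) <= current <= version.parse(_TRANSFORMERS_MAX_VERSION):
    raise ImportError(
        f"transformers=={transformers.__version__} is outside the supported range "
        f"[{_TRANSFORMERS_MIN_VERSION}, {_TRANSFORMERS_MAX_VERSION}] for the IPEX export."
    )
```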
88 changes: 48 additions & 40 deletions optimum/exporters/openvino/__main__.py
@@ -14,7 +14,9 @@

import gc
import logging
+ import operator
import warnings
+ from functools import reduce
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union

@@ -23,18 +25,20 @@
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers.utils import is_torch_available

+ from openvino.runtime import Core, Type, save_model
from optimum.exporters import TasksManager
from optimum.exporters.onnx.base import OnnxConfig
from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED
from optimum.exporters.openvino.convert import export_from_model
from optimum.intel.utils.import_utils import (
+ is_nncf_available,
is_openvino_tokenizers_available,
is_openvino_version,
is_transformers_version,
)
from optimum.utils.save_utils import maybe_load_preprocessors

- from .utils import clear_class_registry
+ from .utils import _MAX_UNCOMPRESSED_SIZE, clear_class_registry


if TYPE_CHECKING:
@@ -45,15 +49,6 @@
import torch


- _COMPRESSION_OPTIONS = {
- "int8": {"bits": 8},
- "int4_sym_g128": {"bits": 4, "sym": True, "group_size": 128},
- "int4_asym_g128": {"bits": 4, "sym": False, "group_size": 128},
- "int4_sym_g64": {"bits": 4, "sym": True, "group_size": 64},
- "int4_asym_g64": {"bits": 4, "sym": False, "group_size": 64},
- }


logger = logging.getLogger(__name__)


@@ -104,8 +99,6 @@ def main_export(
model_kwargs: Optional[Dict[str, Any]] = None,
custom_export_configs: Optional[Dict[str, "OnnxConfig"]] = None,
fn_get_submodels: Optional[Callable] = None,
- compression_option: Optional[str] = None,
- compression_ratio: Optional[float] = None,
ov_config: "OVConfig" = None,
stateful: bool = True,
convert_tokenizer: bool = False,
@@ -167,11 +160,6 @@
fn_get_submodels (`Optional[Callable]`, defaults to `None`):
Experimental usage: Override the default submodels that are used at the export. This is
especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success.
- compression_option (`Optional[str]`, defaults to `None`):
- The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point,
- `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression.
- compression_ratio (`Optional[float]`, defaults to `None`):
- Compression ratio between primary and backup precision (only relevant to INT4).
stateful (`bool`, defaults to `True`):
Produce stateful model where all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. Applicable only for decoder models.
**kwargs_shapes (`Dict`):
@@ -194,28 +182,6 @@
raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.")
token = use_auth_token

- if compression_option is not None:
- logger.warning(
- "The `compression_option` argument is deprecated and will be removed in optimum-intel v1.17.0. "
- "Please, pass an `ov_config` argument instead `OVConfig(..., quantization_config=quantization_config)`."
- )
-
- if compression_ratio is not None:
- logger.warning(
- "The `compression_ratio` argument is deprecated and will be removed in optimum-intel v1.17.0. "
- "Please, pass an `ov_config` argument instead `OVConfig(quantization_config={ratio=compression_ratio})`."
- )
-
- if ov_config is None and compression_option is not None:
- from ...intel.openvino.configuration import OVConfig
-
- if compression_option == "fp16":
- ov_config = OVConfig(dtype="fp16")
- elif compression_option != "fp32":
- q_config = _COMPRESSION_OPTIONS[compression_option] if compression_option in _COMPRESSION_OPTIONS else {}
- q_config["ratio"] = compression_ratio or 1.0
- ov_config = OVConfig(quantization_config=q_config)
-
original_task = task
task = infer_task(
task, model_name_or_path, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token
@@ -402,7 +368,7 @@ class StoreAttr(object):
model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code
)

- export_from_model(
+ submodel_paths = export_from_model(
model=model,
output=output,
task=task,
@@ -425,6 +391,48 @@ class StoreAttr(object):
del model
gc.collect()

+ core = Core()
+ for submodel_path in submodel_paths:
+ submodel_path = Path(output) / submodel_path
+ submodel = core.read_model(submodel_path)
+
+ quantization_config = None
+ if ov_config is None:
+ num_parameters = 0
+ for op in submodel.get_ops():
+ if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
+ num_parameters += reduce(operator.mul, op.shape, 1)
+ if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
+ if is_nncf_available():
+ quantization_config = {"bits": 8, "sym": False}
+ logger.info("The model weights will be quantized to int8_asym.")
+ else:
+ logger.warning(
+ "The model will be converted with no weights quantization. Quantization of the weights to int8 "
+ "requires nncf. Please install it with `pip install nncf`"
+ )
+ break
+ else:
+ quantization_config = ov_config.quantization_config
+ if quantization_config is None:
+ continue
+
+ if not is_nncf_available():
+ raise ImportError("Quantization of the weights requires nncf, please install it with `pip install nncf`")
+
+ from optimum.intel.openvino.quantization import _weight_only_quantization
+
+ _weight_only_quantization(submodel, quantization_config)
+
+ compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
+ save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
+ del submodel
+
+ submodel_path.unlink()
+ submodel_path.with_suffix(".bin").unlink()
+ compressed_submodel_path.rename(submodel_path)
+ compressed_submodel_path.with_suffix(".bin").rename(submodel_path.with_suffix(".bin"))

# Unpatch modules after GPTQ export
if do_gptq_patching:
torch.cuda.is_available = orig_cuda_check
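The loop added above re-opens each exported OpenVINO IR, counts floating-point weight elements in its Constant nodes, and only then decides whether default 8-bit weight compression should kick in. A standalone sketch of just that counting step, assuming an already-exported "model.xml" on disk; the threshold literal below is a stand-in, since the value of `_MAX_UNCOMPRESSED_SIZE` is not shown in this diff:

```python
# Standalone sketch of the weight-counting heuristic used above, run on a local "model.xml".
# The 1e9 threshold is only a placeholder for _MAX_UNCOMPRESSED_SIZE.
import operator
from functools import reduce

from openvino.runtime import Core, Type

core = Core()
submodel = core.read_model("model.xml")

num_parameters = 0
for op in submodel.get_ops():
    # Only floating-point Constant nodes (the weights) count toward the threshold.
    if op.get_type_name() == "Constant" and op.get_element_type() in [Type.f16, Type.f32, Type.bf16]:
        num_parameters += reduce(operator.mul, op.shape, 1)

print(f"{num_parameters:,} floating-point weight elements")
if num_parameters >= 1_000_000_000:  # stand-in threshold
    print("Large model: int8 weight compression would be applied by default.")
```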
38 changes: 3 additions & 35 deletions optimum/exporters/openvino/convert.py
@@ -49,7 +49,6 @@
from .model_patcher import patch_model_with_bettertransformer
from .stateful import ensure_export_task_support_stateful, ensure_stateful_is_available, patch_stateful
from .utils import (
- _MAX_UNCOMPRESSED_SIZE,
OV_XML_FILE_NAME,
clear_class_registry,
flattenize_inputs,
@@ -76,21 +75,7 @@


def _save_model(model, path: str, ov_config: Optional["OVConfig"] = None, library_name: Optional[str] = None):
- compress_to_fp16 = False
-
- if ov_config is not None:
- if ov_config.quantization_config:
- if not is_nncf_available():
- raise ImportError(
- "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`"
- )
-
- from optimum.intel.openvino.quantization import _weight_only_quantization
-
- _weight_only_quantization(model, ov_config.quantization_config)
-
- compress_to_fp16 = ov_config.dtype == "fp16"
-
+ compress_to_fp16 = ov_config is not None and ov_config.dtype == "fp16"
model = _add_version_info_to_model(model, library_name)
save_model(model, path, compress_to_fp16)

@@ -643,25 +628,6 @@ def export_from_model(
)
logging.disable(logging.NOTSET)

- if ov_config is None:
- if library_name == "diffusers":
- num_parameters = model.unet.num_parameters()
- else:
- num_parameters = sum(param.numel() for param in list(model.parameters()) if param.requires_grad)
-
- if num_parameters >= _MAX_UNCOMPRESSED_SIZE:
- if is_nncf_available():
- from ...intel.openvino.configuration import OVConfig
-
- ov_config = OVConfig(quantization_config={"bits": 8, "sym": False})
-
- logger.info("The model weights will be quantized to int8_asym.")
- else:
- logger.warning(
- "The model will be converted with no weights quantization. Quantization of the weights to int8 requires nncf."
- "please install it with `pip install nncf`"
- )
-
if library_name != "diffusers":
# Saving the model config and preprocessor as this is needed sometimes.
model.config.save_pretrained(output)
@@ -720,6 +686,8 @@ def export_from_model(
patch_16bit_model=patch_16bit_model,
)

+ return files_subpaths


def export_tokenizer(
tokenizer,
21 changes: 21 additions & 0 deletions optimum/exporters/openvino/model_configs.py
@@ -54,6 +54,7 @@
CodeGenModelPatcher,
DBRXModelPatcher,
FalconModelPatcher,
+ Gemma2ModelPatcher,
GptNeoxJapaneseModelPatcher,
GptNeoxModelPatcher,
InternLM2Patcher,
@@ -997,3 +998,23 @@ def patch_model_for_export(
self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
) -> "ModelPatcher":
return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs)


+ @register_in_tasks_manager(
+ "gemma2",
+ *[
+ "feature-extraction",
+ "feature-extraction-with-past",
+ "text-generation",
+ "text-generation-with-past",
+ "text-classification",
+ ],
+ library_name="transformers",
+ )
+ class Gemma2OpenVINOConfig(GemmaOnnxConfig):
+ MIN_TRANSFORMERS_VERSION = version.parse("4.43.0")
+
+ def patch_model_for_export(
+ self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+ ) -> "ModelPatcher":
+ return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs)
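With the registration above, Gemma2 checkpoints export like any other supported architecture (given transformers >= 4.43). A minimal usage sketch; the model id and output directory are placeholders, not taken from this commit:

```python
# Illustrative use of the Gemma2 export path enabled above; model id and paths are placeholders.
from optimum.intel import OVModelForCausalLM

model = OVModelForCausalLM.from_pretrained("google/gemma-2-2b", export=True, compile=False)
model.save_pretrained("gemma2-openvino")
```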