diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 67af0d3497..764526f6ed 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -14,7 +14,6 @@ import logging import os -import types import warnings from pathlib import Path from tempfile import TemporaryDirectory @@ -25,7 +24,6 @@ from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.utils import EntryNotFoundError from neural_compressor.transformers import GPTQConfig, RtnConfig -from neural_compressor.transformers.quantization import convert_to_quantized_model, save_low_bit from neural_compressor.utils.pytorch import load from transformers import ( AutoConfig, @@ -52,11 +50,10 @@ from ...modeling_base import OptimizedModel from ..utils.import_utils import ( _torch_version, - is_ipex_version, - is_neural_compressor_version, is_torch_version, ) from .configuration import INCConfig +from .quantization import weight_only_quantization from .utils import QUANTIZATION_CONFIG_NAME @@ -124,19 +121,10 @@ def _from_pretrained( local_files_only: bool = False, subfolder: str = "", trust_remote_code: bool = False, - *model_args, **kwargs, ): - device_map = kwargs.get("device_map", "cpu") - use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False - quantization_config = kwargs.pop("quantization_config", None) - if not isinstance(config, PretrainedConfig): - config, _ = AutoConfig.from_pretrained( - model_id, - return_unused_kwargs=True, - **kwargs, - ) + if hasattr(config, "quantization_config"): if config.quantization_config is None: logger.warning( @@ -167,75 +155,25 @@ def _from_pretrained( logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.") return model except Exception as e: - logger.error(e) - logger.error("Saved low bit model loading failed, please check your model.") - exit(0) + raise RuntimeError(f"The quantized model cannot be loaded. Detailed error: {e}") if isinstance(quantization_config, (RtnConfig, GPTQConfig)): - logger.info("Applying Weight Only Quantization.") - warnings.warn( - "Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.", - DeprecationWarning, + model = weight_only_quantization( + cls.auto_model_class, + model_id, + quantization_config=quantization_config, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + config=config, + **kwargs, ) - if is_neural_compressor_version("<=", "3.0"): - raise AssertionError("Please use neural_compressor version > 3.0.") - if is_ipex_version("<", "2.3.1") and use_xpu: - raise AssertionError("Please use intel_extension_for_pytorch version >= 2.3.1.") - - if use_xpu: - # TODO: if low_cpu_mem_uasge is True, gptj will have accuracy issue on CPU device. - kwargs["low_cpu_mem_usage"] = True - kwargs["device_map"] = "cpu" - try: - model = cls.auto_model_class.from_pretrained( - model_id, - *model_args, - config=config, - **kwargs, - ) - model.config.update({"low_cpu_mem_usage": True}) - except NotImplementedError: - logger.info( - "Failed to load models with `low_cpu_mem_usage` specified, " - "will fall to traditional load method with higher memory consumption." 
- ) - kwargs["low_cpu_mem_usage"] = False - model = cls.auto_model_class.from_pretrained( - model_id, - *model_args, - config=config, - **kwargs, - ) - model.config.update({"low_cpu_mem_usage": False}) - quantization_config.post_init_xpu() - else: - kwargs["low_cpu_mem_usage"] = True - model = cls.auto_model_class.from_pretrained( - model_id, - *model_args, - config=config, - **kwargs, - ) - model.config.update({"low_cpu_mem_usage": True}) - quantization_config.post_init_cpu() - model.eval() - - if use_xpu: - assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!" - quantization_config.update(**{"device": "xpu"}) - quantization_config.post_init_xpu() - if ( - not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu") - ) and model.config.model_type == "chatglm": - model = model.float() - model = convert_to_quantized_model(model, quantization_config, device=device_map) - quantization_config.remove_redundant_parameters() - model.config.quantization_config = quantization_config - # add quantization_config and save_low_bit to pretrained model dynamically - model.device_map = device_map - model.quantization_config = quantization_config - model.save_pretrained = types.MethodType(save_low_bit, model) - logger.info("WeightOnlyQuant done.") + return model + if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 8c08648850..5d53ca6a52 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -15,6 +15,7 @@ import copy import inspect import logging +import types import warnings from enum import Enum from pathlib import Path @@ -22,9 +23,12 @@ import torch from datasets import Dataset, load_dataset +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from neural_compressor.config import PostTrainingQuantConfig from neural_compressor.model.torch_model import IPEXModel, PyTorchModel from neural_compressor.quantization import fit +from neural_compressor.transformers import GPTQConfig, RtnConfig +from neural_compressor.transformers.quantization import convert_to_quantized_model, save_low_bit from torch.utils.data import DataLoader, RandomSampler from transformers import ( DataCollator, @@ -44,16 +48,6 @@ is_neural_compressor_version, ) from .configuration import INCConfig -from .modeling_base import ( # noqa - INCModel, - INCModelForMaskedLM, - INCModelForMultipleChoice, - INCModelForQuestionAnswering, - INCModelForSeq2SeqLM, - INCModelForSequenceClassification, - INCModelForTokenClassification, - INCModelForVision2Seq, -) from .utils import ( IPEX_MINIMUM_VERSION, NEURAL_COMPRESSOR_MINIMUM_VERSION, @@ -339,3 +333,105 @@ def _get_calibration_dataloader( def _remove_unused_columns(self, dataset: Dataset): ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) return dataset.remove_columns(ignored_columns) + + +def weight_only_quantization( + model_class, + model_id: Union[str, Path], + quantization_config: Union[RtnConfig, GPTQConfig], + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + file_name: str = WEIGHTS_NAME, + local_files_only: bool = 
False,
+    subfolder: str = "",
+    trust_remote_code: bool = False,
+    **kwargs,
+):
+    device_map = kwargs.get("device_map", "xpu" if (hasattr(torch, "xpu") and torch.xpu.is_available()) else "cpu")
+    use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False
+
+    warnings.warn(
+        "Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
+        DeprecationWarning,
+    )
+    if is_neural_compressor_version("<=", "3.0"):
+        raise AssertionError("Please use neural_compressor version > 3.0.")
+    if is_ipex_version("<", "2.3.1") and use_xpu:
+        raise AssertionError("Please use intel_extension_for_pytorch version >= 2.3.1.")
+
+    if use_xpu:
+        # TODO: if low_cpu_mem_usage is True, gptj will have an accuracy issue on CPU device.
+        kwargs["low_cpu_mem_usage"] = True
+        kwargs["device_map"] = "cpu"
+        try:
+            model = model_class.from_pretrained(
+                model_id,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                config=config,
+                **kwargs,
+            )
+            model.config.update({"low_cpu_mem_usage": True})
+        except NotImplementedError:
+            logger.info(
+                "Failed to load models with `low_cpu_mem_usage` specified, "
+                "will fall back to the traditional load method with higher memory consumption."
+            )
+            kwargs["low_cpu_mem_usage"] = False
+            model = model_class.from_pretrained(
+                model_id,
+                subfolder=subfolder,
+                revision=revision,
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                force_download=force_download,
+                trust_remote_code=trust_remote_code,
+                config=config,
+                **kwargs,
+            )
+            model.config.update({"low_cpu_mem_usage": False})
+        quantization_config.post_init_xpu()
+    else:
+        kwargs["low_cpu_mem_usage"] = True
+        model = model_class.from_pretrained(
+            model_id,
+            subfolder=subfolder,
+            revision=revision,
+            cache_dir=cache_dir,
+            token=token,
+            local_files_only=local_files_only,
+            force_download=force_download,
+            trust_remote_code=trust_remote_code,
+            config=config,
+            **kwargs,
+        )
+        model.config.update({"low_cpu_mem_usage": True})
+        quantization_config.post_init_cpu()
+    model.eval()
+
+    if use_xpu:
+        assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!"
+ quantization_config.update(**{"device": "xpu"}) + quantization_config.post_init_xpu() + if ( + not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu") + ) and model.config.model_type == "chatglm": + model = model.float() + model = convert_to_quantized_model(model, quantization_config, device=device_map) + quantization_config.remove_redundant_parameters() + model.config.quantization_config = quantization_config + # add quantization_config and save_low_bit to pretrained model dynamically + model.device_map = device_map + model.quantization_config = quantization_config + model.save_pretrained = types.MethodType(save_low_bit, model) + return model diff --git a/setup.py b/setup.py index 8076ddf2e1..2e4df54e62 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"] EXTRAS_REQUIRE = { - "neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers<4.43.0"], + "neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers"], "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.11.0"], "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"], diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index 6efd9b543a..38b6d62f90 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -21,7 +21,6 @@ import torch from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed -from transformers.utils import SAFE_WEIGHTS_NAME from optimum.exporters import TasksManager from optimum.intel import ( # noqa @@ -39,7 +38,7 @@ INCStableDiffusionPipeline, INCTrainer, ) -from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME +from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME os.environ["CUDA_VISIBLE_DEVICES"] = ""
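A minimal usage sketch of the refactored weight-only quantization path (illustration only, not part of the diff). It assumes the public INCModelForCausalLM entry point, neural_compressor's RtnConfig with bits/group_size arguments, and a placeholder model id; with such a config, _from_pretrained is expected to dispatch to the new weight_only_quantization helper in quantization.py.

from neural_compressor.transformers import RtnConfig
from optimum.intel import INCModelForCausalLM

# 4-bit round-to-nearest weight-only quantization applied at load time;
# the parameter values are illustrative, not recommendations.
quantization_config = RtnConfig(bits=4, group_size=32)
model = INCModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gptj",  # placeholder model id
    quantization_config=quantization_config,
)

# weight_only_quantization rebinds save_pretrained to neural_compressor's
# save_low_bit, so the quantized checkpoint can be written out and then
# reloaded through the saved low-bit loading branch of _from_pretrained.
model.save_pretrained("quantized_model")
reloaded = INCModelForCausalLM.from_pretrained("quantized_model")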