move woq quantization to quantization.py
Signed-off-by: changwangss <[email protected]>
changwangss committed Sep 5, 2024
1 parent d55004b commit fcadbac
Showing 4 changed files with 126 additions and 93 deletions.
98 changes: 18 additions & 80 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -14,7 +14,6 @@

import logging
import os
import types
import warnings
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -25,7 +24,6 @@
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from huggingface_hub.utils import EntryNotFoundError
from neural_compressor.transformers import GPTQConfig, RtnConfig
from neural_compressor.transformers.quantization import convert_to_quantized_model, save_low_bit
from neural_compressor.utils.pytorch import load
from transformers import (
AutoConfig,
@@ -52,11 +50,10 @@
from ...modeling_base import OptimizedModel
from ..utils.import_utils import (
_torch_version,
is_ipex_version,
is_neural_compressor_version,
is_torch_version,
)
from .configuration import INCConfig
from .quantization import weight_only_quantization
from .utils import QUANTIZATION_CONFIG_NAME


@@ -124,19 +121,10 @@ def _from_pretrained(
local_files_only: bool = False,
subfolder: str = "",
trust_remote_code: bool = False,
*model_args,
**kwargs,
):
device_map = kwargs.get("device_map", "cpu")
use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False

quantization_config = kwargs.pop("quantization_config", None)
if not isinstance(config, PretrainedConfig):
config, _ = AutoConfig.from_pretrained(
model_id,
return_unused_kwargs=True,
**kwargs,
)

if hasattr(config, "quantization_config"):
if config.quantization_config is None:
logger.warning(
@@ -167,75 +155,25 @@ def _from_pretrained(
logger.info("Saved low bit model loading successfully. Other input args " "will be ignored.")
return model
except Exception as e:
logger.error(e)
logger.error("Saved low bit model loading failed, please check your model.")
exit(0)
raise RuntimeError(f"The quantized model cannot be loaded. Detailed error: {e}")
if isinstance(quantization_config, (RtnConfig, GPTQConfig)):
logger.info("Applying Weight Only Quantization.")
warnings.warn(
"Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
DeprecationWarning,
model = weight_only_quantization(
cls.auto_model_class,
model_id,
quantization_config=quantization_config,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
if is_neural_compressor_version("<=", "3.0"):
raise AssertionError("Please use neural_compressor version > 3.0.")
if is_ipex_version("<", "2.3.1") and use_xpu:
raise AssertionError("Please use intel_extension_for_pytorch version >= 2.3.1.")

if use_xpu:
# TODO: if low_cpu_mem_usage is True, gptj will have an accuracy issue on the CPU device.
kwargs["low_cpu_mem_usage"] = True
kwargs["device_map"] = "cpu"
try:
model = cls.auto_model_class.from_pretrained(
model_id,
*model_args,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": True})
except NotImplementedError:
logger.info(
"Failed to load models with `low_cpu_mem_usage` specified, "
"will fall to traditional load method with higher memory consumption."
)
kwargs["low_cpu_mem_usage"] = False
model = cls.auto_model_class.from_pretrained(
model_id,
*model_args,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": False})
quantization_config.post_init_xpu()
else:
kwargs["low_cpu_mem_usage"] = True
model = cls.auto_model_class.from_pretrained(
model_id,
*model_args,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": True})
quantization_config.post_init_cpu()
model.eval()

if use_xpu:
assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!"
quantization_config.update(**{"device": "xpu"})
quantization_config.post_init_xpu()
if (
not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu")
) and model.config.model_type == "chatglm":
model = model.float()
model = convert_to_quantized_model(model, quantization_config, device=device_map)
quantization_config.remove_redundant_parameters()
model.config.quantization_config = quantization_config
# add quantization_config and save_low_bit to pretrained model dynamically
model.device_map = device_map
model.quantization_config = quantization_config
model.save_pretrained = types.MethodType(save_low_bit, model)
logger.info("WeightOnlyQuant done.")

return model

if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.",
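After this change, the RtnConfig/GPTQConfig branch of `_from_pretrained` above only forwards the load arguments to the new `weight_only_quantization` helper. Below is a minimal sketch of how a caller reaches that path, assuming `INCModelForCausalLM` as the entry point and using a purely illustrative model id and RTN settings (none of which appear in this commit):

    from neural_compressor.transformers import RtnConfig
    from optimum.intel import INCModelForCausalLM

    # Passing an RtnConfig (or GPTQConfig) triggers the "Applying Weight Only Quantization."
    # branch, which now forwards model_class, model_id and the hub kwargs to
    # weight_only_quantization() in quantization.py.
    woq_config = RtnConfig(bits=4, group_size=128)  # illustrative settings
    model = INCModelForCausalLM.from_pretrained(
        "facebook/opt-125m",            # hypothetical model id
        quantization_config=woq_config,
        device_map="cpu",               # "xpu" additionally requires intel_extension_for_pytorch >= 2.3.1
    )
    # The helper rebinds save_pretrained to neural_compressor's save_low_bit before returning.
    model.save_pretrained("opt-125m-woq")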
116 changes: 106 additions & 10 deletions optimum/intel/neural_compressor/quantization.py
@@ -15,16 +15,20 @@
import copy
import inspect
import logging
import types
import warnings
from enum import Enum
from pathlib import Path
from typing import Callable, Optional, Union

import torch
from datasets import Dataset, load_dataset
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from neural_compressor.config import PostTrainingQuantConfig
from neural_compressor.model.torch_model import IPEXModel, PyTorchModel
from neural_compressor.quantization import fit
from neural_compressor.transformers import GPTQConfig, RtnConfig
from neural_compressor.transformers.quantization import convert_to_quantized_model, save_low_bit
from torch.utils.data import DataLoader, RandomSampler
from transformers import (
DataCollator,
@@ -44,16 +48,6 @@
is_neural_compressor_version,
)
from .configuration import INCConfig
from .modeling_base import ( # noqa
INCModel,
INCModelForMaskedLM,
INCModelForMultipleChoice,
INCModelForQuestionAnswering,
INCModelForSeq2SeqLM,
INCModelForSequenceClassification,
INCModelForTokenClassification,
INCModelForVision2Seq,
)
from .utils import (
IPEX_MINIMUM_VERSION,
NEURAL_COMPRESSOR_MINIMUM_VERSION,
@@ -339,3 +333,105 @@ def _get_calibration_dataloader(
def _remove_unused_columns(self, dataset: Dataset):
ignored_columns = list(set(dataset.column_names) - set(self._signature_columns))
return dataset.remove_columns(ignored_columns)


def weight_only_quantization(
model_class,
model_id: Union[str, Path],
quantization_config: Union[RtnConfig, GPTQConfig],
config: PretrainedConfig,
use_auth_token: Optional[Union[bool, str]] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
force_download: bool = False,
cache_dir: str = HUGGINGFACE_HUB_CACHE,
file_name: str = WEIGHTS_NAME,
local_files_only: bool = False,
subfolder: str = "",
trust_remote_code: bool = False,
**kwargs,
):
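"""Apply INC weight-only quantization (RTN or GPTQ) while loading `model_id` with `model_class`.

Moved here from modeling_base.py: loads the model with `low_cpu_mem_usage`, runs
`convert_to_quantized_model`, then attaches `quantization_config`, `device_map` and a
`save_low_bit`-backed `save_pretrained` to the returned model before handing it back.
"""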
device_map = kwargs.get("device_map", "xpu" if (hasattr(torch, "xpu") and torch.xpu.is_available()) else "cpu")
use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False

warnings.warn(
"Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
DeprecationWarning,
)
if is_neural_compressor_version("<=", "3.0"):
raise AssertionError("Please use neural_compressor version > 3.0.")
if is_ipex_version("<", "2.3.1") and use_xpu:
raise AssertionError("Please use intel_extension_for_pytorch version >= 2.3.1.")

if use_xpu:
# TODO: if low_cpu_mem_usage is True, gptj will have an accuracy issue on the CPU device.
kwargs["low_cpu_mem_usage"] = True
kwargs["device_map"] = "cpu"
try:
model = model_class.from_pretrained(
model_id,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": True})
except NotImplementedError:
logger.info(
"Failed to load models with `low_cpu_mem_usage` specified, "
"will fall to traditional load method with higher memory consumption."
)
kwargs["low_cpu_mem_usage"] = False
model = model_class.from_pretrained(
model_id,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": False})
quantization_config.post_init_xpu()
else:
kwargs["low_cpu_mem_usage"] = True
model = model_class.from_pretrained(
model_id,
subfolder=subfolder,
revision=revision,
cache_dir=cache_dir,
token=token,
local_files_only=local_files_only,
force_download=force_download,
trust_remote_code=trust_remote_code,
config=config,
**kwargs,
)
model.config.update({"low_cpu_mem_usage": True})
quantization_config.post_init_cpu()
model.eval()

if use_xpu:
assert hasattr(torch, "xpu") and torch.xpu.is_available(), "There is no xpu device in this system!"
quantization_config.update(**{"device": "xpu"})
quantization_config.post_init_xpu()
if (
not torch.cuda.is_available() or device_map == "cpu" or device_map == torch.device("cpu")
) and model.config.model_type == "chatglm":
model = model.float()
model = convert_to_quantized_model(model, quantization_config, device=device_map)
quantization_config.remove_redundant_parameters()
model.config.quantization_config = quantization_config
# add quantization_config and save_low_bit to pretrained model dynamically
model.device_map = device_map
model.quantization_config = quantization_config
model.save_pretrained = types.MethodType(save_low_bit, model)
return model
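For reference, the relocated helper can also be exercised directly; the sketch below mirrors the call that modeling_base.py now makes, with plain `AutoModelForCausalLM`/`AutoConfig` and a hypothetical model id standing in for what the caller would pass (none of these specifics come from this diff):

    from transformers import AutoConfig, AutoModelForCausalLM
    from neural_compressor.transformers import RtnConfig
    from optimum.intel.neural_compressor.quantization import weight_only_quantization

    model_id = "facebook/opt-125m"  # hypothetical model id
    config = AutoConfig.from_pretrained(model_id)

    quantized = weight_only_quantization(
        AutoModelForCausalLM,                    # model_class used for the initial fp32 load
        model_id,
        quantization_config=RtnConfig(bits=4),   # illustrative; a GPTQConfig follows the same path
        config=config,
    )
    # quantization_config has been attached and save_pretrained rebound to save_low_bit.
    quantized.save_pretrained("opt-125m-rtn-int4")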
2 changes: 1 addition & 1 deletion setup.py
@@ -59,7 +59,7 @@
QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]

EXTRAS_REQUIRE = {
"neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers<4.43.0"],
"neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers"],
"openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
"nncf": ["nncf>=2.11.0"],
"ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"],
3 changes: 1 addition & 2 deletions tests/neural_compressor/test_modeling.py
@@ -21,7 +21,6 @@
import torch
from parameterized import parameterized
from transformers import AutoTokenizer, pipeline, set_seed
from transformers.utils import SAFE_WEIGHTS_NAME

from optimum.exporters import TasksManager
from optimum.intel import ( # noqa
@@ -39,7 +38,7 @@
INCStableDiffusionPipeline,
INCTrainer,
)
from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME
from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME


os.environ["CUDA_VISIBLE_DEVICES"] = ""
