Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve UT coverage of PT Utils and Quantization #1842

Merged
merged 12 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ neural-compressor
intel-extension-for-transformers
lm_eval==0.4.2
peft
auto_round
intel_extension_for_pytorch
Original file line number Diff line number Diff line change
Expand Up @@ -236,9 +236,9 @@ def get_user_model():
# 3.x api
if args.approach == 'weight_only':
from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
from neural_compressor.torch.utils import get_double_quant_config
from neural_compressor.torch.utils import get_double_quant_config_dict
weight_sym = True if args.woq_scheme == "sym" else False
double_quant_config_dict = get_double_quant_config(args.double_quant_type)
double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type)

if args.woq_algo == "RTN":
if args.double_quant_type is not None:
Expand Down
2 changes: 1 addition & 1 deletion neural_compressor/torch/algorithms/pt2e_quant/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from neural_compressor.common.utils import logger
from neural_compressor.torch.algorithms.base_algorithm import Quantizer
from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter
from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config
from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config


class W8A8PT2EQuantizer(Quantizer):
Expand Down
79 changes: 79 additions & 0 deletions neural_compressor/torch/algorithms/pt2e_quant/utility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
from torch.ao.quantization.quantizer import QuantizationSpec
from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer


def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
select_dtype = dtype_mapping[dtype]
min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
qscheme_mapping = {
"per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
"per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
}
observer_mapping = {
"placeholder": PlaceholderObserver,
"minmax": MinMaxObserver,
"kl": HistogramObserver,
}
# Force to use placeholder observer for dynamic quantization
if is_dynamic:
algo = "placeholder"
# algo
observer_or_fake_quant_ctr = observer_mapping[algo]
# qscheme
qscheme = qscheme_mapping[granularity][sym]
quantization_spec = QuantizationSpec(
dtype=select_dtype,
quant_min=min_max_mapping[select_dtype][0],
quant_max=min_max_mapping[select_dtype][1],
observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
qscheme=qscheme,
is_dynamic=is_dynamic,
)
return quantization_spec


def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
    """Build a torch ``QuantizationConfig`` from an INC config object.

    The input-activation and weight specs are derived from the ``act_*`` and ``w_*``
    attributes of ``inc_config``; the output-activation and bias specs are copied from
    torch's default x86 inductor quantization config.

    Args:
        inc_config: object exposing ``act_dtype``, ``act_sym``, ``act_granularity``,
            ``act_algo``, ``w_dtype``, ``w_sym``, ``w_granularity`` and ``w_algo``.
        is_dynamic: passed to the default-config lookup and to the activation spec.

    Returns:
        A ``QuantizationConfig`` with ``is_qat=False``.
    """
    torch_defaults = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
    activation_spec = create_quant_spec_from_config(
        dtype=inc_config.act_dtype,
        sym=inc_config.act_sym,
        granularity=inc_config.act_granularity,
        algo=inc_config.act_algo,
        is_dynamic=is_dynamic,
    )
    # The weight spec is built without `is_dynamic`, so it always takes the default (False).
    weight_spec = create_quant_spec_from_config(
        dtype=inc_config.w_dtype,
        sym=inc_config.w_sym,
        granularity=inc_config.w_granularity,
        algo=inc_config.w_algo,
    )
    return QuantizationConfig(
        input_activation=activation_spec,
        output_activation=torch_defaults.output_activation,
        weight=weight_spec,
        bias=torch_defaults.bias,
        is_qat=False,
    )


def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
    """Create an ``X86InductorQuantizer`` whose global config mirrors ``config``.

    Args:
        config: INC config object handed to ``_map_inc_config_to_torch_quant_config``.
        is_dynamic: forwarded to the config mapping.

    Returns:
        The quantizer with its global quantization config set.
    """
    x86_quantizer = xiq.X86InductorQuantizer()
    # Only the global config is applied; local configs are skipped for now (need torch 2.4).
    x86_quantizer.set_global(_map_inc_config_to_torch_quant_config(config, is_dynamic))
    return x86_quantizer
18 changes: 12 additions & 6 deletions neural_compressor/torch/utils/auto_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class CUDA_Accelerator:
return accelerator_registry.register_accelerator_impl(name=name, priority=priority)


class Auto_Accelerator(ABC):
class Auto_Accelerator(ABC): # pragma: no cover
@classmethod
@abstractmethod
def is_available(cls) -> bool:
Expand Down Expand Up @@ -175,7 +175,7 @@ def synchronize(self):


@register_accelerator(name="cuda", priority=PRIORITY_CUDA)
class CUDA_Accelerator(Auto_Accelerator):
class CUDA_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "cuda"

Expand Down Expand Up @@ -211,7 +211,7 @@ def empty_cache(self):


@register_accelerator(name="xpu", priority=PRIORITY_XPU)
class XPU_Accelerator(Auto_Accelerator):
class XPU_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "xpu"

Expand Down Expand Up @@ -250,7 +250,7 @@ def empty_cache(self):


@register_accelerator(name="hpu", priority=PRIORITY_HPU)
class HPU_Accelerator(Auto_Accelerator):
class HPU_Accelerator(Auto_Accelerator): # pragma: no cover
def __init__(self) -> None:
self._name = "hpu"

Expand All @@ -275,7 +275,10 @@ def synchronize(self):
return torch.hpu.synchronize()

def set_device(self, device_index):
return torch.hpu.set_device(device_index)
try:
torch.hpu.set_device(device_index)
except Exception as e:
logger.warning(e)

def current_device(self):
    """Return whatever ``torch.hpu.current_device()`` reports."""
    return torch.hpu.current_device()
Expand All @@ -287,7 +290,10 @@ def device(self, device_index=None):
return torch.hpu.device(device_index)

def empty_cache(self):
return torch.hpu.empty_cache()
try:
torch.hpu.empty_cache()
except Exception as e:
logger.warning(e)

def mark_step(self):
    """Delegate to ``htcore.mark_step()`` and return its result."""
    return htcore.mark_step()
Expand Down
40 changes: 25 additions & 15 deletions neural_compressor/torch/utils/environ.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import importlib
import sys

import torch
from packaging.version import Version

# pylint:disable=import-error
try:
    import habana_frameworks.torch.hpex

    _hpex_available = True
except Exception:
    # Any ordinary import-time failure means hpex cannot be used. A bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit, which must keep propagating.
    _hpex_available = False


def is_hpex_available():
    """Return True if ``habana_frameworks.torch.hpex`` imported successfully at module load."""
    return _hpex_available


################ Check imported sys.module first to decide behavior #################
def is_ipex_imported() -> bool:
for name, _ in sys.modules.items():
if name == "intel_extension_for_pytorch":
Expand All @@ -45,11 +35,29 @@ def is_transformers_imported() -> bool:
return False


try:
import intel_extension_for_pytorch as ipex
################ Check available sys.module to decide behavior #################
def is_package_available(package_name):
    """Report whether ``package_name`` can be located by the import system.

    The lookup uses ``importlib.util.find_spec``, so the named module itself is not
    imported (for a dotted name, importlib imports the parent packages first).

    Args:
        package_name: absolute module or package name, e.g. ``"intel_extension_for_pytorch"``.

    Returns:
        True if a spec was found, False otherwise -- including when the parent of a
        dotted name cannot be imported.
    """
    from importlib.util import find_spec

    try:
        package_spec = find_spec(package_name)
    except ImportError:
        # find_spec raises ModuleNotFoundError (an ImportError) when the parent package of
        # a dotted name is missing; for an availability probe that simply means "no".
        return False
    return package_spec is not None


## check hpex
# `is_package_available` already yields a bool, so its result is stored as the flag directly.
_hpex_available = is_package_available("habana_frameworks")


def is_hpex_available():
    """Return the flag recorded at import time for the ``habana_frameworks`` package lookup."""
    return _hpex_available


## check ipex
if is_package_available("intel_extension_for_pytorch"):
_ipex_available = True
except:
else:
_ipex_available = False


Expand All @@ -60,6 +68,8 @@ def is_ipex_available():
def get_ipex_version():
if is_ipex_available():
try:
import intel_extension_for_pytorch as ipex

ipex_version = ipex.__version__.split("+")[0]
except ValueError as e: # pragma: no cover
assert False, "Got an unknown version of intel_extension_for_pytorch: {}".format(e)
Expand Down
66 changes: 1 addition & 65 deletions neural_compressor/torch/utils/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@
from typing import Callable, Dict, List, Tuple, Union

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
from torch.ao.quantization.quantizer import QuantizationSpec
from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
from typing_extensions import TypeAlias

from neural_compressor.common import logger
Expand Down Expand Up @@ -120,11 +116,9 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) ->
return filter_result


def get_double_quant_config(double_quant_type):
def get_double_quant_config_dict(double_quant_type="BNB_NF4"):
from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS

if double_quant_type is None:
return {}
assert double_quant_type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format(
list(DOUBLE_QUANT_CONFIGS.keys())
)
Expand Down Expand Up @@ -170,61 +164,3 @@ def postprocess_model(model, mode, quantizer):
elif mode == Mode.CONVERT or mode == Mode.QUANTIZE:
if getattr(model, "quantizer", False):
del model.quantizer


def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
select_dtype = dtype_mapping[dtype]
min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
qscheme_mapping = {
"per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
"per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
}
observer_mapping = {
"placeholder": PlaceholderObserver,
"minmax": MinMaxObserver,
"kl": HistogramObserver,
}
# Force to use placeholder observer for dynamic quantization
if is_dynamic:
algo = "placeholder"
# algo
observer_or_fake_quant_ctr = observer_mapping[algo]
# qscheme
qscheme = qscheme_mapping[granularity][sym]
quantization_spec = QuantizationSpec(
dtype=select_dtype,
quant_min=min_max_mapping[select_dtype][0],
quant_max=min_max_mapping[select_dtype][1],
observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
qscheme=qscheme,
is_dynamic=is_dynamic,
)
return quantization_spec


def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
    """Build a torch ``QuantizationConfig`` from an INC config object.

    The input-activation and weight specs are derived from the ``act_*`` and ``w_*``
    attributes of ``inc_config``; the output-activation and bias specs are copied from
    torch's default x86 inductor quantization config.

    Args:
        inc_config: object exposing ``act_dtype``, ``act_sym``, ``act_granularity``,
            ``act_algo``, ``w_dtype``, ``w_sym``, ``w_granularity`` and ``w_algo``.
        is_dynamic: passed to the default-config lookup and to the activation spec.

    Returns:
        A ``QuantizationConfig`` with ``is_qat=False``.
    """
    torch_defaults = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
    activation_spec = create_quant_spec_from_config(
        dtype=inc_config.act_dtype,
        sym=inc_config.act_sym,
        granularity=inc_config.act_granularity,
        algo=inc_config.act_algo,
        is_dynamic=is_dynamic,
    )
    # The weight spec is built without `is_dynamic`, so it always takes the default (False).
    weight_spec = create_quant_spec_from_config(
        dtype=inc_config.w_dtype,
        sym=inc_config.w_sym,
        granularity=inc_config.w_granularity,
        algo=inc_config.w_algo,
    )
    return QuantizationConfig(
        input_activation=activation_spec,
        output_activation=torch_defaults.output_activation,
        weight=weight_spec,
        bias=torch_defaults.bias,
        is_qat=False,
    )


def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
    """Create an ``X86InductorQuantizer`` whose global config mirrors ``config``.

    Args:
        config: INC config object handed to ``_map_inc_config_to_torch_quant_config``.
        is_dynamic: forwarded to the config mapping.

    Returns:
        The quantizer with its global quantization config set.
    """
    x86_quantizer = xiq.X86InductorQuantizer()
    # Only the global config is applied; local configs are skipped for now (need torch 2.4).
    x86_quantizer.set_global(_map_inc_config_to_torch_quant_config(config, is_dynamic))
    return x86_quantizer
3 changes: 1 addition & 2 deletions requirements_pt.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
auto-round
intel_extension_for_pytorch
numpy
peft==0.10.0
psutil
py-cpuinfo
Expand Down
4 changes: 3 additions & 1 deletion test/3x/torch/quantization/weight_only/test_rtn.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
prepare,
quantize,
)
from neural_compressor.torch.utils import accelerator
from neural_compressor.torch.utils import accelerator, is_hpex_available

device = accelerator.current_device_name()

Expand Down Expand Up @@ -76,6 +76,8 @@ def test_int_params(self, bits, use_sym, group_size, group_dim):
model = convert(model)
out = model(self.example_inputs)[0]
assert (out != self.label).any(), "WOQ output should be different with raw output"
if is_hpex_available():
assert "hpu" in out.device, "Neural Compressor should run on HPU when HPEX is available."
if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1):
assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
if (bits, use_sym, group_size, group_dim) == [(4, True, 128, 0), (4, True, 32, 1)]:
Expand Down
Loading
Loading