diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
index f0b56e558d3..9688a4f6cb3 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/requirements.txt
@@ -11,3 +11,5 @@ neural-compressor
 intel-extension-for-transformers
 lm_eval==0.4.2
 peft
+auto_round
+intel_extension_for_pytorch
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 44236584737..5d39cf3a62b 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -236,9 +236,10 @@ def get_user_model():
     # 3.x api
     if args.approach == 'weight_only':
         from neural_compressor.torch.quantization import RTNConfig, GPTQConfig, prepare, convert, quantize
-        from neural_compressor.torch.utils import get_double_quant_config
+        from neural_compressor.torch.utils import get_double_quant_config_dict
         weight_sym = True if args.woq_scheme == "sym" else False
-        double_quant_config_dict = get_double_quant_config(args.double_quant_type)
+        if args.double_quant_type is not None:
+            double_quant_config_dict = get_double_quant_config_dict(args.double_quant_type)
 
         if args.woq_algo == "RTN":
             if args.double_quant_type is not None:
diff --git a/neural_compressor/torch/algorithms/pt2e_quant/core.py b/neural_compressor/torch/algorithms/pt2e_quant/core.py
index 129ca6f072a..a1b4d1f65b6 100644
--- a/neural_compressor/torch/algorithms/pt2e_quant/core.py
+++ b/neural_compressor/torch/algorithms/pt2e_quant/core.py
@@ -26,7 +26,7 @@
 from neural_compressor.common.utils import logger
 from neural_compressor.torch.algorithms.base_algorithm import Quantizer
 from neural_compressor.torch.algorithms.pt2e_quant import half_precision_rewriter as hp_rewriter
-from neural_compressor.torch.utils import create_xiq_quantizer_from_pt2e_config
+from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config
 
 
 class W8A8PT2EQuantizer(Quantizer):
diff --git a/neural_compressor/torch/algorithms/pt2e_quant/utility.py b/neural_compressor/torch/algorithms/pt2e_quant/utility.py
new file mode 100644
index 00000000000..92635db1f70
--- /dev/null
+++ b/neural_compressor/torch/algorithms/pt2e_quant/utility.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict
+
+import torch
+import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
+from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
+from torch.ao.quantization.quantizer import QuantizationSpec
+from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
+
+
+def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
+    dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
+    select_dtype = dtype_mapping[dtype]
+    min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
+    qscheme_mapping = {
+        "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
+        "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
+    }
+    observer_mapping = {
+        "placeholder": PlaceholderObserver,
+        "minmax": MinMaxObserver,
+        "kl": HistogramObserver,
+    }
+    # Force to use placeholder observer for dynamic quantization
+    if is_dynamic:
+        algo = "placeholder"
+    # algo
+    observer_or_fake_quant_ctr = observer_mapping[algo]
+    # qscheme
+    qscheme = qscheme_mapping[granularity][sym]
+    quantization_spec = QuantizationSpec(
+        dtype=select_dtype,
+        quant_min=min_max_mapping[select_dtype][0],
+        quant_max=min_max_mapping[select_dtype][1],
+        observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
+        qscheme=qscheme,
+        is_dynamic=is_dynamic,
+    )
+    return quantization_spec
+
+
+def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
+    default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
+    input_act_quant_spec = create_quant_spec_from_config(
+        inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
+    )
+    weight_quant_spec = create_quant_spec_from_config(
+        inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
+    )
+    quant_config = QuantizationConfig(
+        input_activation=input_act_quant_spec,
+        output_activation=default_quant_config.output_activation,
+        weight=weight_quant_spec,
+        bias=default_quant_config.bias,
+        is_qat=False,
+    )
+    return quant_config
+
+
+def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
+    quantizer = xiq.X86InductorQuantizer()
+    # set global
+    global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
+    quantizer.set_global(global_config)
+    # Skip the local config for now (need torch 2.4)
+    return quantizer
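Usage sketch for the relocated helper (illustrative only, not part of the patch): it shows how an INC-style config could be turned into an X86InductorQuantizer and fed into the standard PT2E prepare/convert flow. The SimpleNamespace config and the torch.export.export call are assumptions; the real INC config class and the preferred export entry point depend on the installed torch version.

```python
from types import SimpleNamespace

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from neural_compressor.torch.algorithms.pt2e_quant.utility import create_xiq_quantizer_from_pt2e_config

# Stand-in for the INC static-quant config; only the fields read by
# create_quant_spec_from_config are filled in.
inc_like_config = SimpleNamespace(
    act_dtype="uint8", act_sym=False, act_granularity="per_tensor", act_algo="kl",
    w_dtype="int8", w_sym=True, w_granularity="per_tensor", w_algo="minmax",
)

model = torch.nn.Sequential(torch.nn.Linear(8, 8)).eval()
example_inputs = (torch.randn(2, 8),)

# The export step varies across torch releases; torch.export.export(...).module() is one option.
exported_model = torch.export.export(model, example_inputs).module()

quantizer = create_xiq_quantizer_from_pt2e_config(inc_like_config, is_dynamic=False)
prepared_model = prepare_pt2e(exported_model, quantizer)
prepared_model(*example_inputs)  # calibration pass
quantized_model = convert_pt2e(prepared_model)
```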
diff --git a/neural_compressor/torch/utils/auto_accelerator.py b/neural_compressor/torch/utils/auto_accelerator.py
index 91ffc41fcac..be7dd7f842f 100644
--- a/neural_compressor/torch/utils/auto_accelerator.py
+++ b/neural_compressor/torch/utils/auto_accelerator.py
@@ -98,7 +98,7 @@ class CUDA_Accelerator:
     return accelerator_registry.register_accelerator_impl(name=name, priority=priority)
 
 
-class Auto_Accelerator(ABC):
+class Auto_Accelerator(ABC):  # pragma: no cover
     @classmethod
     @abstractmethod
     def is_available(cls) -> bool:
@@ -175,7 +175,7 @@ def synchronize(self):
 
 
 @register_accelerator(name="cuda", priority=PRIORITY_CUDA)
-class CUDA_Accelerator(Auto_Accelerator):
+class CUDA_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "cuda"
 
@@ -211,7 +211,7 @@ def empty_cache(self):
 
 
 @register_accelerator(name="xpu", priority=PRIORITY_XPU)
-class XPU_Accelerator(Auto_Accelerator):
+class XPU_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "xpu"
 
@@ -250,7 +250,7 @@ def empty_cache(self):
 
 
 @register_accelerator(name="hpu", priority=PRIORITY_HPU)
-class HPU_Accelerator(Auto_Accelerator):
+class HPU_Accelerator(Auto_Accelerator):  # pragma: no cover
     def __init__(self) -> None:
         self._name = "hpu"
 
@@ -275,7 +275,10 @@ def synchronize(self):
         return torch.hpu.synchronize()
 
     def set_device(self, device_index):
-        return torch.hpu.set_device(device_index)
+        try:
+            torch.hpu.set_device(device_index)
+        except Exception as e:
+            logger.warning(e)
 
     def current_device(self):
         return torch.hpu.current_device()
@@ -287,7 +290,10 @@ def device(self, device_index=None):
         return torch.hpu.device(device_index)
 
     def empty_cache(self):
-        return torch.hpu.empty_cache()
+        try:
+            torch.hpu.empty_cache()
+        except Exception as e:
+            logger.warning(e)
 
     def mark_step(self):
         return htcore.mark_step()
diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py
index 611ab5fda15..3091aa83d88 100644
--- a/neural_compressor/torch/utils/environ.py
+++ b/neural_compressor/torch/utils/environ.py
@@ -13,24 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import importlib
 import sys
 
 import torch
 from packaging.version import Version
 
-# pylint:disable=import-error
-try:
-    import habana_frameworks.torch.hpex
-
-    _hpex_available = True
-except:
-    _hpex_available = False
-
-
-def is_hpex_available():
-    return _hpex_available
-
 
+################ Check imported sys.module first to decide behavior #################
 def is_ipex_imported() -> bool:
     for name, _ in sys.modules.items():
         if name == "intel_extension_for_pytorch":
@@ -45,11 +35,29 @@ def is_transformers_imported() -> bool:
     return False
 
 
-try:
-    import intel_extension_for_pytorch as ipex
+################ Check available sys.module to decide behavior #################
+def is_package_available(package_name):
+    from importlib.util import find_spec
+
+    package_spec = find_spec(package_name)
+    return package_spec is not None
+
+## check hpex
+if is_package_available("habana_frameworks"):
+    _hpex_available = True
+else:
+    _hpex_available = False
+
+
+def is_hpex_available():
+    return _hpex_available
+
+
+## check ipex
+if is_package_available("intel_extension_for_pytorch"):
     _ipex_available = True
-except:
+else:
     _ipex_available = False
 
 
 def is_ipex_available():
@@ -60,6 +68,8 @@ def is_ipex_available():
 def get_ipex_version():
     if is_ipex_available():
         try:
+            import intel_extension_for_pytorch as ipex
+
             ipex_version = ipex.__version__.split("+")[0]
         except ValueError as e:  # pragma: no cover
             assert False, "Got an unknown version of intel_extension_for_pytorch: {}".format(e)
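A short sketch of how the reworked checks are meant to be consumed (illustrative; the fallback order below is an assumption, not something this patch prescribes). The availability helpers now answer "is the package installed?" via find_spec without importing the heavy extension, while is_ipex_imported only reports whether the current process has already imported it.

```python
from neural_compressor.torch.utils.environ import is_hpex_available, is_ipex_available, is_ipex_imported


def pick_backend() -> str:
    # Cheap capability probes: neither call imports the extension itself.
    if is_hpex_available():
        return "hpu"
    if is_ipex_available():
        return "cpu-with-ipex"
    return "cpu"


print(pick_backend())
print("ipex already imported in this process:", is_ipex_imported())
```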
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index f88c768cfed..95db23711cf 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -16,10 +16,6 @@
 from typing import Callable, Dict, List, Tuple, Union
 
 import torch
-import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver, PlaceholderObserver
-from torch.ao.quantization.quantizer import QuantizationSpec
-from torch.ao.quantization.quantizer.x86_inductor_quantizer import QuantizationConfig, X86InductorQuantizer
 from typing_extensions import TypeAlias
 
 from neural_compressor.common import logger
@@ -120,11 +116,9 @@ def get_model_info(model: torch.nn.Module, white_module_list: List[Callable]) ->
     return filter_result
 
 
-def get_double_quant_config(double_quant_type):
+def get_double_quant_config_dict(double_quant_type="BNB_NF4"):
     from neural_compressor.torch.utils.constants import DOUBLE_QUANT_CONFIGS
 
-    if double_quant_type is None:
-        return {}
     assert double_quant_type in DOUBLE_QUANT_CONFIGS, "Supported double quant configs: {}".format(
         list(DOUBLE_QUANT_CONFIGS.keys())
     )
@@ -170,61 +164,3 @@ def postprocess_model(model, mode, quantizer):
     elif mode == Mode.CONVERT or mode == Mode.QUANTIZE:
         if getattr(model, "quantizer", False):
             del model.quantizer
-
-
-def create_quant_spec_from_config(dtype, sym, granularity, algo, is_dynamic=False) -> QuantizationSpec:
-    dtype_mapping: Dict[str, torch.dtype] = {"int8": torch.int8, "uint8": torch.uint8}
-    select_dtype = dtype_mapping[dtype]
-    min_max_mapping = {torch.int8: (-128, 127), torch.uint8: (0, 255)}
-    qscheme_mapping = {
-        "per_channel": {True: torch.per_channel_symmetric, False: torch.per_tensor_affine},
-        "per_tensor": {True: torch.per_tensor_symmetric, False: torch.per_tensor_affine},
-    }
-    observer_mapping = {
-        "placeholder": PlaceholderObserver,
-        "minmax": MinMaxObserver,
-        "kl": HistogramObserver,
-    }
-    # Force to use placeholder observer for dynamic quantization
-    if is_dynamic:
-        algo = "placeholder"
-    # algo
-    observer_or_fake_quant_ctr = observer_mapping[algo]
-    # qscheme
-    qscheme = qscheme_mapping[granularity][sym]
-    quantization_spec = QuantizationSpec(
-        dtype=select_dtype,
-        quant_min=min_max_mapping[select_dtype][0],
-        quant_max=min_max_mapping[select_dtype][1],
-        observer_or_fake_quant_ctr=observer_or_fake_quant_ctr,
-        qscheme=qscheme,
-        is_dynamic=is_dynamic,
-    )
-    return quantization_spec
-
-
-def _map_inc_config_to_torch_quant_config(inc_config, is_dynamic=False) -> QuantizationConfig:
-    default_quant_config = xiq.get_default_x86_inductor_quantization_config(is_dynamic=is_dynamic)
-    input_act_quant_spec = create_quant_spec_from_config(
-        inc_config.act_dtype, inc_config.act_sym, inc_config.act_granularity, inc_config.act_algo, is_dynamic=is_dynamic
-    )
-    weight_quant_spec = create_quant_spec_from_config(
-        inc_config.w_dtype, inc_config.w_sym, inc_config.w_granularity, inc_config.w_algo
-    )
-    quant_config = QuantizationConfig(
-        input_activation=input_act_quant_spec,
-        output_activation=default_quant_config.output_activation,
-        weight=weight_quant_spec,
-        bias=default_quant_config.bias,
-        is_qat=False,
-    )
-    return quant_config
-
-
-def create_xiq_quantizer_from_pt2e_config(config, is_dynamic=False) -> X86InductorQuantizer:
-    quantizer = xiq.X86InductorQuantizer()
-    # set global
-    global_config = _map_inc_config_to_torch_quant_config(config, is_dynamic)
-    quantizer.set_global(global_config)
-    # Skip the local config for now (need torch 2.4)
-    return quantizer
diff --git a/requirements_pt.txt b/requirements_pt.txt
index 67c5371c46c..6a012a75b5a 100644
--- a/requirements_pt.txt
+++ b/requirements_pt.txt
@@ -1,5 +1,4 @@
-auto-round
-intel_extension_for_pytorch
+numpy
 peft==0.10.0
 psutil
 py-cpuinfo
diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py
index 12d4232efe3..d6b31bbca25 100644
--- a/test/3x/torch/quantization/weight_only/test_rtn.py
+++ b/test/3x/torch/quantization/weight_only/test_rtn.py
@@ -14,7 +14,7 @@
     prepare,
     quantize,
 )
-from neural_compressor.torch.utils import accelerator
+from neural_compressor.torch.utils import accelerator, is_hpex_available
 
 device = accelerator.current_device_name()
 
@@ -76,6 +76,8 @@ def test_int_params(self, bits, use_sym, group_size, group_dim):
         model = convert(model)
         out = model(self.example_inputs)[0]
         assert (out != self.label).any(), "WOQ output should be different with raw output"
+        if is_hpex_available():
+            assert "hpu" in str(out.device), "Neural Compressor should run on HPU when HPEX is available."
         if (bits, use_sym, group_size, group_dim) == (8, True, -1, 1):
             assert torch.allclose(out, self.label, atol=0.01), "Accuracy gap atol > 0.01 is unexpected."
         if (bits, use_sym, group_size, group_dim) == [(4, True, 128, 0), (4, True, 32, 1)]:
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 28a91bccca8..bdf99d92cf0 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,4 +1,6 @@
+auto_round
 expecttest
+intel_extension_for_pytorch
 numpy
 peft==0.10.0
 prettytable
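For reference, a hedged sketch of the call pattern the renamed helper now expects, mirroring run_clm_no_trainer.py: get_double_quant_config_dict always requires a valid preset name (callers guard against None themselves) and returns a plain dict. Passing that dict to RTNConfig as keyword arguments is an assumption about how the preset keys map onto the config fields; adapt as needed.

```python
import torch

from neural_compressor.torch.quantization import RTNConfig, convert, prepare
from neural_compressor.torch.utils import get_double_quant_config_dict

double_quant_type = "BNB_NF4"  # must be a key of DOUBLE_QUANT_CONFIGS; "GGML_TYPE_Q4_K" is another preset
if double_quant_type is not None:
    double_quant_config_dict = get_double_quant_config_dict(double_quant_type)
    quant_config = RTNConfig(**double_quant_config_dict)  # assumes preset keys match RTNConfig fields
else:
    quant_config = RTNConfig()

model = torch.nn.Sequential(torch.nn.Linear(64, 64))
model = prepare(model, quant_config)
model = convert(model)
```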
diff --git a/test/3x/torch/test_auto_accelerator.py b/test/3x/torch/test_auto_accelerator.py
deleted file mode 100644
index 918a54ebbd5..00000000000
--- a/test/3x/torch/test_auto_accelerator.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import os
-
-import pytest
-import torch
-
-from neural_compressor.torch.utils import get_accelerator
-from neural_compressor.torch.utils.auto_accelerator import accelerator_registry, auto_detect_accelerator
-
-
-class Test_CPU_Accelerator:
-    @pytest.fixture
-    def force_use_cpu(self, monkeypatch):
-        # Force use CPU
-        monkeypatch.setenv("FORCE_DEVICE", "cpu")
-
-    def test_cpu_accelerator(self, force_use_cpu):
-        print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}")
-        accelerator = auto_detect_accelerator()
-        assert accelerator.current_device() == "cpu", f"{accelerator.current_device()}"
-        assert accelerator.current_device_name() == "cpu"
-        assert accelerator.is_available()
-        assert accelerator.set_device(1) is None
-        assert accelerator.device() is None
-        assert accelerator.empty_cache() is None
-        assert accelerator.synchronize() is None
-
-
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
-class Test_CUDA_Accelerator:
-
-    @pytest.fixture
-    def force_use_cuda(self, monkeypatch):
-        # Force use CUDA
-        monkeypatch.setenv("FORCE_DEVICE", "cuda")
-
-    def test_cuda_accelerator(self, force_use_cuda):
-        print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}")
-        accelerator = auto_detect_accelerator()
-        assert accelerator.current_device() == 0, f"{accelerator.current_device()}"
-        assert accelerator.current_device_name() == "cuda:0"
-        assert accelerator.device() is not None
-        assert accelerator.empty_cache() is None
-        assert accelerator.synchronize() is None
-        assert accelerator.set_device(0) is None
-        assert accelerator.device_name(0) == "cuda:0"
-        assert accelerator.is_available() is True
-        assert accelerator.name() == "cuda"
-        assert accelerator.device_name(1) == "cuda:1"
-        assert accelerator.set_device(1) is None
-        assert accelerator.device_name(1) == "cuda:1"
-        assert accelerator.current_device() == 1
-        assert accelerator.current_device_name() == "cuda:1"
-        assert accelerator.synchronize() is None
-        assert accelerator.empty_cache() is None
-
-    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available")
-    def test_get_device(self):
-        accelerator = auto_detect_accelerator()
-        assert accelerator.set_device(1) is None
-        assert accelerator.current_device_name() == "cuda:1"
-        cur_device = get_accelerator().current_device_name()
-        assert cur_device == "cuda:1"
-        tmp_tensor = torch.tensor([1, 2], device=cur_device)
-        assert "cuda:1" == str(tmp_tensor.device)
-
-
-class TestAutoAccelerator:
-
-    @pytest.fixture
-    def set_cuda_available(self, monkeypatch):
-        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
-
-    def test_auto_accelerator(self, set_cuda_available):
-        accelerator = auto_detect_accelerator()
-        all_accelerators = accelerator_registry.get_sorted_accelerators()
-        assert accelerator.name() == all_accelerators[0]().name()
diff --git a/test/3x/torch/test_utils.py b/test/3x/torch/test_utils.py
deleted file mode 100644
index 00ca99a5734..00000000000
--- a/test/3x/torch/test_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import unittest
-
-import torch
-
-from neural_compressor.torch.utils import logger
-
-
-def get_gpt_j():
-    import transformers
-
-    tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained(
-        "hf-internal-testing/tiny-random-GPTJForCausalLM",
-        torchscript=True,
-    )
-    return tiny_gptj
-
-
-def build_simple_torch_model():
-    class Model(torch.nn.Module):
-        def __init__(self):
-            super(Model, self).__init__()
-            self.fc1 = torch.nn.Linear(8, 30)
-            self.fc2 = torch.nn.Linear(30, 60)
-            self.fc3 = torch.nn.Linear(60, 30)
-            self.fc4 = torch.nn.Linear(30, 50)
-
-        def forward(self, x):
-            out = self.fc1(x)
-            out = self.fc2(out)
-            out = self.fc3(out)
-            out = self.fc4(out)
-            return out
-
-    model = Model()
-    return model
-
-
-from neural_compressor.torch.utils.utility import fetch_module, set_module
-
-
-class TestTorchUtils(unittest.TestCase):
-    @classmethod
-    def setUpClass(self):
-        self.model = get_gpt_j()
-
-    @classmethod
-    def tearDownClass(self):
-        pass
-
-    def setUp(self):
-        # print the test name
-        logger.info(f"Running TestTorchUtils test: {self.id()}")
-
-    def test_fetch_module(self):
-        result = fetch_module(self.model, "transformer.h.2.mlp.fc_in")
-        self.assertIsInstance(result, torch.nn.Linear)
-
-    def test_set_module(self):
-        module_name = "transformer.h.2.mlp.fc_in"
-        mew_value = torch.nn.Linear(32, 128, bias=False)
-        set_module(self.model, module_name, mew_value)
-        result = fetch_module(self.model, module_name)
-        self.assertFalse(result.bias)
-
-    def test_set_module_nonexistent_attribute(self):
-        new_value = torch.nn.Parameter(torch.Tensor([3.0]))
-        attr_name = "transformer.nonexistent_attr"
-        set_module(self.model, attr_name, new_value)
-        result = fetch_module(self.model, attr_name)
-        self.assertTrue(torch.equal(result, torch.Tensor([3.0])))
-
-    def test_fetch_module_nonexistent_attribute(self):
-        attr_name = "transformer.nonexistent_attr"
-        result = fetch_module(self.model, attr_name)
-        self.assertIsNone(result)
-
-    def test_get_model_info(self):
-        from neural_compressor.torch.utils.utility import get_model_info
-
-        white_module_list = [torch.nn.Linear]
-        model_info = get_model_info(build_simple_torch_model(), white_module_list)
-        self.assertEqual(len(model_info), 4)
-
-
-if __name__ == "__main__":
-    unittest.main()
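The relocated accelerator tests that follow exercise two behaviours worth keeping in mind: auto-detection follows the registry priority (the HPU test asserts it wins when HPEX is present), and the FORCE_DEVICE environment variable overrides detection. A small sketch of that contract (illustrative, not part of the patch):

```python
import os

from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

# Highest-priority available backend wins when nothing is forced.
accelerator = auto_detect_accelerator()
print(accelerator.name(), accelerator.current_device_name())

# FORCE_DEVICE pins the choice, which is how the CPU/XPU/CUDA test classes isolate themselves.
os.environ["FORCE_DEVICE"] = "cpu"
print(auto_detect_accelerator().current_device_name())  # -> "cpu"
```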
diff --git a/test/3x/torch/utils/test_auto_accelerator.py b/test/3x/torch/utils/test_auto_accelerator.py
new file mode 100644
index 00000000000..dea9cdce918
--- /dev/null
+++ b/test/3x/torch/utils/test_auto_accelerator.py
@@ -0,0 +1,138 @@
+import os
+
+import pytest
+import torch
+
+from neural_compressor.torch.utils import get_accelerator
+from neural_compressor.torch.utils.auto_accelerator import (
+    CPU_Accelerator,
+    CUDA_Accelerator,
+    HPU_Accelerator,
+    XPU_Accelerator,
+    accelerator_registry,
+    auto_detect_accelerator,
+)
+
+
+@pytest.mark.skipif(not HPU_Accelerator.is_available(), reason="HPEX is not available")
+class TestHPUAccelerator:
+    def test_cuda_accelerator(self):
+        assert os.environ.get("FORCE_DEVICE", None) is None, "FORCE_DEVICE shouldn't be set. HPU is the first priority."
+        accelerator = auto_detect_accelerator()
+        assert accelerator.current_device() == 0, f"{accelerator.current_device()}"
+        assert accelerator.current_device_name() == "hpu:0"
+        assert accelerator.device() is not None
+        assert accelerator.device_name(0) == "hpu:0"
+        assert accelerator.is_available() is True
+        assert accelerator.name() == "hpu"
+        assert accelerator.device_name(1) == "hpu:1"
+        assert accelerator.synchronize() is None
+        assert accelerator.empty_cache() is None
+
+    def test_get_device(self):
+        if torch.hpu.device_count() < 2:
+            return
+        accelerator = auto_detect_accelerator()
+        assert accelerator.set_device(1) is None
+        assert accelerator.current_device_name() == "hpu:1"
+        cur_device = get_accelerator().current_device_name()
+        assert cur_device == "hpu:1"
+        tmp_tensor = torch.tensor([1, 2], device=cur_device)
+        assert "hpu:1" == str(tmp_tensor.device)
+
+
+@pytest.mark.skipif(not XPU_Accelerator.is_available(), reason="XPU is not available")
+class TestXPUAccelerator:
+
+    @pytest.fixture
+    def force_use_xpu(self, monkeypatch):
+        # Force use xpu
+        monkeypatch.setenv("FORCE_DEVICE", "xpu")
+
+    def test_xpu_accelerator(self, force_use_xpu):
+        print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}")
+        accelerator = auto_detect_accelerator()
+        assert accelerator.current_device() == 0, f"{accelerator.current_device()}"
+        assert accelerator.current_device_name() == "xpu:0"
+        assert accelerator.device() is not None
+        assert accelerator.set_device(0) is None
+        assert accelerator.device_name(0) == "xpu:0"
+        assert accelerator.is_available() is True
+        assert accelerator.name() == "xpu"
+        assert accelerator.device_name(1) == "xpu:1"
+        assert accelerator.synchronize() is None
+        assert accelerator.empty_cache() is None
+
+    def test_get_device(self):
+        if torch.xpu.device_count() < 2:
+            return
+        accelerator = auto_detect_accelerator()
+        assert accelerator.set_device(1) is None
+        assert accelerator.current_device_name() == "xpu:1"
+        cur_device = get_accelerator().current_device_name()
+        assert cur_device == "xpu:1"
+        tmp_tensor = torch.tensor([1, 2], device=cur_device)
+        assert "xpu:1" == str(tmp_tensor.device)
+
+
+class TestCPUAccelerator:
+    @pytest.fixture
+    def force_use_cpu(self, monkeypatch):
+        # Force use CPU
+        monkeypatch.setenv("FORCE_DEVICE", "cpu")
+
+    def test_cpu_accelerator(self, force_use_cpu):
+        print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}")
+        accelerator = auto_detect_accelerator()
+        assert accelerator.current_device() == "cpu", f"{accelerator.current_device()}"
+        assert accelerator.current_device_name() == "cpu"
+        assert accelerator.is_available()
+        assert accelerator.set_device(1) is None
+        assert accelerator.device() is None
+        assert accelerator.empty_cache() is None
+        assert accelerator.synchronize() is None
+
+
+@pytest.mark.skipif(not CUDA_Accelerator.is_available(), reason="CUDA is not available")
+class TestCUDAAccelerator:
+
+    @pytest.fixture
+    def force_use_cuda(self, monkeypatch):
+        # Force use CUDA
+        monkeypatch.setenv("FORCE_DEVICE", "cuda")
+
+    def test_cuda_accelerator(self, force_use_cuda):
+        print(f"FORCE_DEVICE: {os.environ.get('FORCE_DEVICE', None)}")
+        accelerator = auto_detect_accelerator()
+        assert accelerator.current_device() == 0, f"{accelerator.current_device()}"
+        assert accelerator.current_device_name() == "cuda:0"
+        assert accelerator.device() is not None
+        assert accelerator.set_device(0) is None
+        assert accelerator.device_name(0) == "cuda:0"
+        assert accelerator.is_available() is True
+        assert accelerator.name() == "cuda"
+        assert accelerator.device_name(1) == "cuda:1"
+        assert accelerator.synchronize() is None
+        assert accelerator.empty_cache() is None
+
+    @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Only one GPU is available")
+    def test_get_device(self):
+        accelerator = auto_detect_accelerator()
+        assert accelerator.set_device(1) is None
+        assert accelerator.current_device_name() == "cuda:1"
+        cur_device = get_accelerator().current_device_name()
+        assert cur_device == "cuda:1"
+        tmp_tensor = torch.tensor([1, 2], device=cur_device)
+        assert "cuda:1" == str(tmp_tensor.device)
+
+
+class TestAutoAccelerator:
+
+    @pytest.fixture
+    def set_cuda_available(self, monkeypatch):
+        monkeypatch.setattr(torch.cuda, "is_available", lambda: True)
+
+    def test_auto_accelerator(self, set_cuda_available):
+        accelerator = auto_detect_accelerator()
+        all_accelerators = accelerator_registry.get_sorted_accelerators()
+        assert accelerator.name() == all_accelerators[0]().name()
diff --git a/test/3x/torch/utils/test_torch_utility.py b/test/3x/torch/utils/test_torch_utility.py
new file mode 100644
index 00000000000..b84db61ff7a
--- /dev/null
+++ b/test/3x/torch/utils/test_torch_utility.py
@@ -0,0 +1,79 @@
+import pytest
+import torch
+
+from neural_compressor.torch.utils.utility import get_double_quant_config_dict
+
+
+def get_gpt_j():
+    import transformers
+
+    tiny_gptj = transformers.AutoModelForCausalLM.from_pretrained(
+        "hf-internal-testing/tiny-random-GPTJForCausalLM",
+        torchscript=True,
+    )
+    return tiny_gptj
+
+
+def build_simple_torch_model():
+    class Model(torch.nn.Module):
+        def __init__(self):
+            super(Model, self).__init__()
+            self.fc1 = torch.nn.Linear(8, 30)
+            self.fc2 = torch.nn.Linear(30, 60)
+            self.fc3 = torch.nn.Linear(60, 30)
+            self.fc4 = torch.nn.Linear(30, 50)
+
+        def forward(self, x):
+            out = self.fc1(x)
+            out = self.fc2(out)
+            out = self.fc3(out)
+            out = self.fc4(out)
+            return out
+
+    model = Model()
+    return model
+
+
+from neural_compressor.torch.utils.utility import fetch_module, set_module
+
+
+class TestTorchUtils:
+    def setup_class(self):
+        self.model = get_gpt_j()
+
+    def teardown_class(self):
+        pass
+
+    @pytest.mark.parametrize(
+        "module_name",
+        [
+            "transformer.h.2.mlp.fc_in",
+            "transformer.nonexistent_attr",
+        ],
+    )
+    def test_fetch_set_module(self, module_name):
+        # fetch
+        result = fetch_module(self.model, module_name)
+        if "nonexistent_attr" in module_name:
+            assert result is None, "result should be None"
+        else:
+            assert isinstance(result, torch.nn.Linear), "fetched module should be Linear"
+            assert result.bias is not None, "The bias of fetched module should not be None."
+        # set
+        new_value = torch.nn.Linear(32, 128, bias=False)
+        set_module(self.model, module_name, new_value)
+        result = fetch_module(self.model, module_name)
+        print(result)
+        assert result.bias is None, "The bias of new module should be None."
+
+    def test_get_model_info(self):
+        from neural_compressor.torch.utils.utility import get_model_info
+
+        white_module_list = [torch.nn.Linear]
+        model_info = get_model_info(build_simple_torch_model(), white_module_list)
+        assert len(model_info) == 4, "The length of model_info should be 4."
+
+    @pytest.mark.parametrize("double_quant_type", ["BNB_NF4", "GGML_TYPE_Q4_K"])
+    def test_double_quant_config_dict(self, double_quant_type):
+        config_dict = get_double_quant_config_dict(double_quant_type)
+        assert isinstance(config_dict, dict), "The returned object should be a dict."
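Closing sketch of the utility behaviour the new test_torch_utility.py pins down (illustrative; the toy Sequential model is an assumption): fetch_module returns None for a missing dotted name instead of raising, and set_module swaps a submodule in place.

```python
import torch

from neural_compressor.torch.utils.utility import fetch_module, set_module

model = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.Linear(16, 4))

# Fetch an existing submodule by its dotted name.
assert isinstance(fetch_module(model, "0"), torch.nn.Linear)

# Missing names come back as None rather than raising.
assert fetch_module(model, "nonexistent_attr") is None

# Swap a submodule in place, as the parametrized fetch/set test does with a bias-free Linear.
set_module(model, "0", torch.nn.Linear(8, 16, bias=False))
assert fetch_module(model, "0").bias is None
```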