From 16a7b11508c008d4d4180a0fe0e31c75b8e5d662 Mon Sep 17 00:00:00 2001
From: Yi Liu <106061964+yiliu30@users.noreply.github.com>
Date: Thu, 11 Jul 2024 17:13:24 +0800
Subject: [PATCH] Get default config based on the auto-detected CPU type (#1904)

Signed-off-by: yiliu30
---
 neural_compressor/common/utils/constants.py |   3 +
 neural_compressor/common/utils/utility.py   | 132 +++++++++++++++++-
 .../torch/quantization/config.py            |  48 ++++---
 neural_compressor/torch/utils/utility.py    |  41 +++++-
 test/3x/common/test_utility.py              |  50 ++++++-
 test/3x/torch/test_config.py                |  54 +++++--
 6 files changed, 291 insertions(+), 37 deletions(-)

diff --git a/neural_compressor/common/utils/constants.py b/neural_compressor/common/utils/constants.py
index adf7755003b..76846682fd4 100644
--- a/neural_compressor/common/utils/constants.py
+++ b/neural_compressor/common/utils/constants.py
@@ -56,3 +56,6 @@ class Mode(Enum):
     PREPARE = "prepare"
     CONVERT = "convert"
     QUANTIZE = "quantize"
+
+
+SERVER_PROCESSOR_BRAND_KEY_WORLD_LST = ["Xeon"]
diff --git a/neural_compressor/common/utils/utility.py b/neural_compressor/common/utils/utility.py
index 8ba28d7512d..56326246d85 100644
--- a/neural_compressor/common/utils/utility.py
+++ b/neural_compressor/common/utils/utility.py
@@ -17,6 +17,7 @@
 """The utility of common module."""
 
 import collections
+import enum
 import importlib
 import subprocess
 import time
@@ -26,7 +27,7 @@
 import psutil
 from prettytable import PrettyTable
 
-from neural_compressor.common.utils import Mode, TuningLogger, logger
+from neural_compressor.common.utils import Mode, TuningLogger, constants, logger
 
 __all__ = [
     "set_workspace",
@@ -41,6 +42,9 @@
     "CpuInfo",
     "default_tuning_logger",
     "call_counter",
+    "cpu_info",
+    "ProcessorType",
+    "detect_processor_type_based_on_hw",
     "Statistics",
 ]
 
@@ -92,7 +96,7 @@ def __call__(self, *args, **kwargs):
 
 @singleton
 class CpuInfo(object):
-    """CPU info collection."""
+    """Get CPU Info."""
 
     def __init__(self):
         """Get whether the cpu numerical format is bf16, the number of sockets, cores and cores per socket."""
@@ -113,6 +117,39 @@ def __init__(self):
             b"\xB8\x07\x00\x00\x00" b"\x0f\xa2" b"\xC3",  # mov eax, 7 # cpuid # ret
         )
         self._bf16 = bool(eax & (1 << 5))
+        self._info = info
+        self._brand_raw = info.get("brand_raw", "")
+        # Detect the info below lazily, only when it is first requested.
+        self._cores = None
+        self._sockets = None
+        self._cores_per_socket = None
+
+    @property
+    def brand_raw(self):
+        """Get the brand name of the CPU."""
+        return self._brand_raw
+
+    @brand_raw.setter
+    def brand_raw(self, brand_name):
+        """Set the brand name of the CPU."""
+        self._brand_raw = brand_name
+
+    @staticmethod
+    def _detect_cores():
+        """Detect the number of physical cores via psutil."""
+        physical_cores = psutil.cpu_count(logical=False)
+        return physical_cores
+
+    @property
+    def cores(self):
+        """Get the number of physical cores on the platform."""
+        if self._cores is None:
+            self._cores = self._detect_cores()
+        return self._cores
+
+    @cores.setter
+    def cores(self, num_of_cores):
+        """Set the number of cores on the platform."""
+        self._cores = num_of_cores
 
     @property
     def bf16(self):
         """Get whether it is bf16."""
@@ -124,6 +161,60 @@ def vnni(self):
         """Get whether it is vnni."""
         return self._vnni
 
+    @property
+    def cores_per_socket(self) -> int:
+        """Get the number of cores per socket."""
+        if self._cores_per_socket is None:
+            self._cores_per_socket = self.cores // self.sockets
+        return self._cores_per_socket
+
+    @property
+    def sockets(self):
+        """Get the number of sockets on the platform."""
+        if self._sockets is None:
+            self._sockets = self._get_number_of_sockets()
+        return self._sockets
+
+    @sockets.setter
+    def sockets(self, num_of_sockets):
+        """Set the number of sockets on the platform."""
+        self._sockets = num_of_sockets
+
+    def _get_number_of_sockets(self) -> int:
+        """Get the number of sockets via an OS-specific shell command."""
+        if "arch" in self._info and "ARM" in self._info["arch"]:  # pragma: no cover
+            return 1
+
+        cmd = "cat /proc/cpuinfo | grep 'physical id' | sort -u | wc -l"
+        if psutil.WINDOWS:
+            cmd = r'wmic cpu get DeviceID | C:\Windows\System32\find.exe /C "CPU"'
+        elif psutil.MACOS:  # pragma: no cover
+            # NOTE: approximates the socket count with the core count on macOS.
+            cmd = "sysctl -n machdep.cpu.core_count"
+
+        num_sockets = None
+        try:
+            with subprocess.Popen(
+                args=cmd,
+                shell=True,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=False,
+            ) as proc:
+                proc.wait()
+                if proc.stdout:
+                    for line in proc.stdout:
+                        num_sockets = int(line.decode("utf-8", errors="ignore").strip())
+        except Exception as e:
+            logger.error("Failed to get number of sockets: %s", e)
+        if isinstance(num_sockets, int) and num_sockets >= 1:
+            return num_sockets
+        else:
+            logger.warning("Failed to get number of sockets, return 1 as default.")
+            return 1
+
+
+cpu_info = CpuInfo()
+
 
 def dump_elapsed_time(customized_msg=""):
     """Get the elapsed time for decorated functions.
@@ -236,6 +327,43 @@ def wrapper(*args, **kwargs):
         return wrapper
 
+
+class ProcessorType(enum.Enum):
+    """The processor type: Client or Server."""
+
+    Client = "Client"
+    Server = "Server"
+
+
+def detect_processor_type_based_on_hw():
+    """Detect the processor type based on the hardware configuration.
+
+    Returns:
+        ProcessorType: The detected processor type (Server or Client).
+    """
+    # Detect the processor type based on the conditions below:
+    # - If there is more than one socket, it is a server.
+    # - If the brand name includes a keyword from `SERVER_PROCESSOR_BRAND_KEY_WORLD_LST`, it is a server.
+    # - If the memory size is greater than 32GB, it is a server.
+    log_msg = "Processor type detected as {processor_type} because {reason}."
+    if cpu_info.sockets > 1:
+        logger.info(log_msg.format(processor_type=ProcessorType.Server.value, reason="there is more than one socket"))
+        return ProcessorType.Server
+    elif any(brand in cpu_info.brand_raw for brand in constants.SERVER_PROCESSOR_BRAND_KEY_WORLD_LST):
+        logger.info(
+            log_msg.format(processor_type=ProcessorType.Server.value, reason=f"the brand name is {cpu_info.brand_raw}")
+        )
+        return ProcessorType.Server
+    elif psutil.virtual_memory().total / (1024**3) > 32:
+        logger.info(
+            log_msg.format(processor_type=ProcessorType.Server.value, reason="the memory size is greater than 32GB")
+        )
+        return ProcessorType.Server
+    else:
+        logger.info(
+            "Processor type detected as %s; pass `processor_type='server'` to override it if needed.",
+            ProcessorType.Client.value,
+        )
+        return ProcessorType.Client
+
+
 class Statistics:
     """The statistics printer."""
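Usage sketch (illustrative only, not part of the patch): exercising the detection helper above directly. The result depends on the host hardware; on a multi-socket Xeon machine it logs and returns Server, otherwise it falls through the brand-keyword and memory checks.

    from neural_compressor.common.utils.utility import (
        ProcessorType,
        cpu_info,
        detect_processor_type_based_on_hw,
    )

    # The singleton defers socket/core counting until first access.
    print(cpu_info.brand_raw, cpu_info.sockets, cpu_info.cores_per_socket)

    # Falls through: socket count -> brand keyword -> memory size -> Client.
    p_type = detect_processor_type_based_on_hw()
    assert p_type in (ProcessorType.Client, ProcessorType.Server)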
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 71b01353d5a..9014f1576a3 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -23,6 +23,7 @@
 
 import torch
 
+import neural_compressor.torch.utils as torch_utils
 from neural_compressor.common.base_config import (
     BaseConfig,
     config_registry,
@@ -219,14 +220,17 @@ def get_config_set_for_tuning(cls) -> Union[None, "RTNConfig", List["RTNConfig"]
             dtype=["int4", "nf4"], use_sym=[True, False], group_size=[32, 128], use_mse_search=[False, True]
         )
 
+    @classmethod
+    def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "RTNConfig"]:
+        """Get the predefined configs for the client and server processor types."""
+        pre_defined_configs: Dict[torch_utils.ProcessorType, RTNConfig] = {}
+        pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)
+        pre_defined_configs[torch_utils.ProcessorType.Server] = cls()
+        return pre_defined_configs
 
 
-def get_default_rtn_config() -> RTNConfig:
-    """Generate the default rtn config.
-
-    Returns:
-        the default rtn config.
-    """
-    return RTNConfig()
+def get_default_rtn_config(processor_type: Optional[Union[str, torch_utils.ProcessorType]] = None) -> RTNConfig:
+    """Generate the default rtn config for the given or auto-detected processor type."""
+    processor_type = torch_utils.get_processor_type_from_user_config(processor_type)
+    return RTNConfig.get_predefined_configs()[processor_type]
 
 
 def get_default_double_quant_config(type="BNB_NF4"):
@@ -378,14 +382,17 @@ def get_config_set_for_tuning(cls) -> Union[None, "GPTQConfig", List["GPTQConfig
         # TODO fwk owner needs to update it.
         return GPTQConfig(act_order=[True, False], use_sym=[False, True])
 
+    @classmethod
+    def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "GPTQConfig"]:
+        """Get the predefined configs for the client and server processor types."""
+        pre_defined_configs: Dict[torch_utils.ProcessorType, GPTQConfig] = {}
+        pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True)
+        pre_defined_configs[torch_utils.ProcessorType.Server] = cls()
+        return pre_defined_configs
 
 
-def get_default_gptq_config() -> GPTQConfig:
-    """Generate the default gptq config.
-
-    Returns:
-        the default gptq config.
- """ - return GPTQConfig() +def get_default_gptq_config(processor_type: Optional[Union[str, torch_utils.ProcessorType]] = None) -> RTNConfig: + process_type = torch_utils.get_processor_type_from_user_config(processor_type) + return GPTQConfig.get_predefined_configs()[process_type] ######################## AWQ Config ############################### @@ -725,6 +732,7 @@ def __init__( not_use_best_mse: bool = False, dynamic_max_gap: int = -1, scale_dtype: str = "fp16", + use_layer_wise: bool = False, white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, ): """Init AUTOROUND weight-only quantization config. @@ -777,6 +785,7 @@ def __init__( self.not_use_best_mse = not_use_best_mse self.dynamic_max_gap = dynamic_max_gap self.scale_dtype = scale_dtype + self.use_layer_wise = use_layer_wise self._post_init() @classmethod @@ -803,14 +812,17 @@ def get_config_set_for_tuning(cls) -> Union[None, "AutoRoundConfig", List["AutoR # TODO fwk owner needs to update it. return AutoRoundConfig(bits=[4, 6]) + @classmethod + def get_predefined_configs(cls) -> Dict[torch_utils.ProcessorType, "AutoRoundConfig"]: + pre_defined_configs: Dict[torch_utils.ProcessorType, AutoRoundConfig] = {} + pre_defined_configs[torch_utils.ProcessorType.Client] = cls(use_layer_wise=True) + pre_defined_configs[torch_utils.ProcessorType.Server] = cls() + return pre_defined_configs -def get_default_AutoRound_config() -> AutoRoundConfig: - """Generate the default AUTOROUND config. - Returns: - the default AUTOROUND config. - """ - return AutoRoundConfig() +def get_default_AutoRound_config(processor_type: Optional[Union[str, torch_utils.ProcessorType]] = None) -> RTNConfig: + process_type = torch_utils.get_processor_type_from_user_config(processor_type) + return AutoRoundConfig.get_predefined_configs()[process_type] ######################## MX Config ############################### diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index bf1bb2a77b1..599be8578f4 100644 --- a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -13,12 +13,21 @@ # limitations under the License. -from typing import Callable, Dict, List, Tuple, Union +import enum +from typing import Callable, Dict, List, Optional, Tuple, Union +import psutil import torch from typing_extensions import TypeAlias -from neural_compressor.common.utils import Mode, Statistics, logger +from neural_compressor.common.utils import ( + Mode, + ProcessorType, + Statistics, + cpu_info, + detect_processor_type_based_on_hw, + logger, +) OP_NAME_AND_TYPE_TUPLE_TYPE: TypeAlias = Tuple[str, Union[torch.nn.Module, Callable]] @@ -235,3 +244,31 @@ def get_model_device(model: torch.nn.Module): """ for n, p in model.named_parameters(): return p.data.device.type # p.data.device == device(type='cpu') + + +def get_processor_type_from_user_config(user_processor_type: Optional[Union[str, ProcessorType]] = None): + """Get the processor type. + + Get the processor type based on the user configuration or automatically detect it based on the hardware. + + Args: + user_processor_type (Optional[Union[str, ProcessorType]]): The user-specified processor type. Defaults to None. + + Returns: + ProcessorType: The detected or user-specified processor type. + + Raises: + AssertionError: If the user-specified processor type is not supported. + NotImplementedError: If the processor type is not recognized. 
+ """ + if user_processor_type is None: + processor_type = detect_processor_type_based_on_hw() + elif isinstance(user_processor_type, ProcessorType): + processor_type = user_processor_type + elif isinstance(user_processor_type, str): + user_processor_type = user_processor_type.lower().capitalize() + assert user_processor_type in ProcessorType.__members__, f"Unsupported processor type: {user_processor_type}" + processor_type = ProcessorType(user_processor_type) + else: + raise NotImplementedError(f"Unsupported processor type: {user_processor_type}") + return processor_type diff --git a/test/3x/common/test_utility.py b/test/3x/common/test_utility.py index b605b3b506b..fd349ce1706 100644 --- a/test/3x/common/test_utility.py +++ b/test/3x/common/test_utility.py @@ -11,6 +11,8 @@ import unittest from unittest.mock import MagicMock, patch +import pytest + import neural_compressor.common.utils.utility as inc_utils from neural_compressor.common import options from neural_compressor.common.utils import ( @@ -41,7 +43,7 @@ def test_set_random_seed(self): set_random_seed(seed) def test_set_workspace(self): - workspace = "/path/to/workspace" + workspace = "/tmp/inc_workspace" set_workspace(workspace) self.assertEqual(options.workspace, workspace) returned_workspace = get_workspace() @@ -78,6 +80,9 @@ def test_cpu_info(self): cpu_info = CpuInfo() assert isinstance(cpu_info.bf16, bool), "bf16 should be a boolean" assert isinstance(cpu_info.vnni, bool), "avx512 should be a boolean" + assert cpu_info.cores >= 1 + assert cpu_info.sockets >= 1 + assert cpu_info.cores_per_socket >= 1 class TestLazyImport(unittest.TestCase): @@ -113,6 +118,11 @@ def test_lazy_import_access_attr(self): self.assertIsNotNone(lazy_import.module) + def test_call_method_module_not_found(self): + with self.assertRaises(ImportError): + lazy_import = LazyImport("non_existent_module") + lazy_import(3, 4) + class TestUtils(unittest.TestCase): def test_dump_elapsed_time(self): @@ -190,5 +200,39 @@ def add(a, b): self.assertEqual(inc_utils.FUNC_CALL_COUNTS["add"], 3) -if __name__ == "__main__": - unittest.main() +class TestAutoDetectProcessorType: + @pytest.fixture + def force_client(self, monkeypatch): + monkeypatch.setattr(inc_utils.cpu_info, "sockets", 1) + monkeypatch.setattr(inc_utils.cpu_info, "brand_raw", "") + + # force the ram size detected by psutil <= 64GB + class MockMemory: + def __init__(self, total): + self.total = total + + # Patch the psutil.virtual_memory() method + monkeypatch.setattr(inc_utils.psutil, "virtual_memory", lambda: MockMemory(16 * 1024**3)) + + def test_auto_detect_processor_type(self, force_client): + p_type = inc_utils.detect_processor_type_based_on_hw() + assert ( + p_type == inc_utils.ProcessorType.Client + ), f"Expect processor type to be {inc_utils.ProcessorType.Client}, got {p_type}" + + def test_detect_processor_type_based_on_hw(self): + # Test when the brand name includes a server keyword + inc_utils.cpu_info.brand_raw = "Intel Xeon Server" + assert inc_utils.detect_processor_type_based_on_hw() == inc_utils.ProcessorType.Server + + # Test when the memory size is greater than 32GB + with patch("psutil.virtual_memory") as mock_virtual_memory: + mock_virtual_memory.return_value.total = 64 * 1024**3 + assert inc_utils.detect_processor_type_based_on_hw() == inc_utils.ProcessorType.Server + + # Test when none of the conditions are met + inc_utils.cpu_info.sockets = 1 + inc_utils.cpu_info.brand_raw = "Intel Core i7" + with patch("psutil.virtual_memory") as mock_virtual_memory: + 
diff --git a/test/3x/common/test_utility.py b/test/3x/common/test_utility.py
index b605b3b506b..fd349ce1706 100644
--- a/test/3x/common/test_utility.py
+++ b/test/3x/common/test_utility.py
@@ -11,6 +11,8 @@
 import unittest
 from unittest.mock import MagicMock, patch
 
+import pytest
+
 import neural_compressor.common.utils.utility as inc_utils
 from neural_compressor.common import options
 from neural_compressor.common.utils import (
@@ -41,7 +43,7 @@ def test_set_random_seed(self):
         set_random_seed(seed)
 
     def test_set_workspace(self):
-        workspace = "/path/to/workspace"
+        workspace = "/tmp/inc_workspace"
         set_workspace(workspace)
         self.assertEqual(options.workspace, workspace)
         returned_workspace = get_workspace()
@@ -78,6 +80,9 @@ def test_cpu_info(self):
         cpu_info = CpuInfo()
         assert isinstance(cpu_info.bf16, bool), "bf16 should be a boolean"
         assert isinstance(cpu_info.vnni, bool), "avx512 should be a boolean"
+        assert cpu_info.cores >= 1
+        assert cpu_info.sockets >= 1
+        assert cpu_info.cores_per_socket >= 1
 
 
 class TestLazyImport(unittest.TestCase):
@@ -113,6 +118,11 @@ def test_lazy_import_access_attr(self):
 
         self.assertIsNotNone(lazy_import.module)
 
+    def test_call_method_module_not_found(self):
+        with self.assertRaises(ImportError):
+            lazy_import = LazyImport("non_existent_module")
+            lazy_import(3, 4)
+
 
 class TestUtils(unittest.TestCase):
     def test_dump_elapsed_time(self):
@@ -190,5 +200,39 @@ def add(a, b):
         self.assertEqual(inc_utils.FUNC_CALL_COUNTS["add"], 3)
 
 
-if __name__ == "__main__":
-    unittest.main()
+class TestAutoDetectProcessorType:
+    @pytest.fixture
+    def force_client(self, monkeypatch):
+        monkeypatch.setattr(inc_utils.cpu_info, "sockets", 1)
+        monkeypatch.setattr(inc_utils.cpu_info, "brand_raw", "")
+
+        # Force the RAM size reported by psutil below the 32GB server threshold.
+        class MockMemory:
+            def __init__(self, total):
+                self.total = total
+
+        # Patch psutil.virtual_memory() as seen by the module under test.
+        monkeypatch.setattr(inc_utils.psutil, "virtual_memory", lambda: MockMemory(16 * 1024**3))
+
+    def test_auto_detect_processor_type(self, force_client):
+        p_type = inc_utils.detect_processor_type_based_on_hw()
+        assert (
+            p_type == inc_utils.ProcessorType.Client
+        ), f"Expect processor type to be {inc_utils.ProcessorType.Client}, got {p_type}"
+
+    def test_detect_processor_type_based_on_hw(self):
+        # Pin the socket count so each condition below is exercised in isolation.
+        inc_utils.cpu_info.sockets = 1
+
+        # Test when the brand name includes a server keyword.
+        inc_utils.cpu_info.brand_raw = "Intel Xeon Server"
+        assert inc_utils.detect_processor_type_based_on_hw() == inc_utils.ProcessorType.Server
+
+        # Test when the memory size is greater than 32GB.
+        inc_utils.cpu_info.brand_raw = "Intel Core i7"
+        with patch("psutil.virtual_memory") as mock_virtual_memory:
+            mock_virtual_memory.return_value.total = 64 * 1024**3
+            assert inc_utils.detect_processor_type_based_on_hw() == inc_utils.ProcessorType.Server
+
+        # Test when none of the conditions are met.
+        with patch("psutil.virtual_memory") as mock_virtual_memory:
+            mock_virtual_memory.return_value.total = 16 * 1024**3
+            assert inc_utils.detect_processor_type_based_on_hw() == inc_utils.ProcessorType.Client
diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py
index c5bdc5261cf..68e7d5975cc 100644
--- a/test/3x/torch/test_config.py
+++ b/test/3x/torch/test_config.py
@@ -1,9 +1,11 @@
 import copy
 import unittest
 
+import pytest
 import torch
 import transformers
 
+import neural_compressor.torch.utils as torch_utils
 from neural_compressor.torch.quantization import (
     AutoRoundConfig,
     AWQConfig,
@@ -13,6 +15,8 @@
     SmoothQuantConfig,
     StaticQuantConfig,
     TEQConfig,
+    get_default_AutoRound_config,
+    get_default_gptq_config,
     get_default_hqq_config,
     get_default_rtn_config,
     quantize,
@@ -331,15 +335,41 @@ def test_hqq_config(self):
         self.assertEqual(hqq_config.to_dict(), hqq_config2.to_dict())
 
 
-class TestQuantConfigForAutotune(unittest.TestCase):
-    def test_expand_config(self):
-        # test the expand functionalities, the user is not aware it
-
-        tune_config = RTNConfig(bits=[4, 6])
-        expand_config_list = RTNConfig.expand(tune_config)
-        self.assertEqual(expand_config_list[0].bits, 4)
-        self.assertEqual(expand_config_list[1].bits, 6)
-
-
-if __name__ == "__main__":
-    unittest.main()
+class TestQuantConfigBasedOnProcessorType:
+
+    @pytest.mark.parametrize("config_cls", [RTNConfig, GPTQConfig, AutoRoundConfig])
+    def test_get_config_based_on_processor_type(self, config_cls):
+        config_for_client = config_cls.get_predefined_configs()[torch_utils.ProcessorType.Client]
+        assert (
+            config_for_client.use_layer_wise
+        ), f"Expect use_layer_wise to be True, got {config_for_client.use_layer_wise}"
+
+        config_for_server = config_cls.get_predefined_configs()[torch_utils.ProcessorType.Server]
+        assert (
+            config_for_server.use_layer_wise is False
+        ), f"Expect use_layer_wise to be False, got {config_for_server.use_layer_wise}"
+
+    @pytest.fixture
+    def force_server(self, monkeypatch):
+        monkeypatch.setattr(torch_utils.utility.cpu_info, "sockets", 2)
+
+    def test_get_default_config_force_server(self, force_server):
+        rtn_config = get_default_rtn_config()
+        assert not rtn_config.use_layer_wise, f"Expect use_layer_wise to be `False`, got {rtn_config.use_layer_wise}"
+        gptq_config = get_default_gptq_config()
+        assert not gptq_config.use_layer_wise, f"Expect use_layer_wise to be `False`, got {gptq_config.use_layer_wise}"
+
+    # NOTE: for p_type=None, the assertions below assume the host is detected as a server.
+    @pytest.mark.parametrize("p_type", [None, torch_utils.ProcessorType.Client, torch_utils.ProcessorType.Server])
+    def test_get_default_config(self, p_type):
+        rtn_config = get_default_rtn_config(processor_type=p_type)
+        assert rtn_config.use_layer_wise == (
+            p_type == torch_utils.ProcessorType.Client
+        ), f"Expect use_layer_wise to be {p_type == torch_utils.ProcessorType.Client}, got {rtn_config.use_layer_wise}"
+        gptq_config = get_default_gptq_config(processor_type=p_type)
+        assert gptq_config.use_layer_wise == (
+            p_type == torch_utils.ProcessorType.Client
+        ), f"Expect use_layer_wise to be {p_type == torch_utils.ProcessorType.Client}, got {gptq_config.use_layer_wise}"
+        autoround_config = get_default_AutoRound_config(processor_type=p_type)
+        assert autoround_config.use_layer_wise == (
+            p_type == torch_utils.ProcessorType.Client
+        ), f"Expect use_layer_wise to be {p_type == torch_utils.ProcessorType.Client}, got {autoround_config.use_layer_wise}"
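Usage sketch (illustrative only, not part of the patch): because cpu_info is a module-level singleton with property setters, the detection result can also be pinned outside of pytest fixtures, mirroring the tests above.

    import neural_compressor.common.utils.utility as inc_utils

    inc_utils.cpu_info.sockets = 2  # the setter bypasses lazy hardware detection
    assert inc_utils.detect_processor_type_based_on_hw() is inc_utils.ProcessorType.Server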