From 9994a865142ca9994675e95c648d5833bcc5573f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:38:43 +0800 Subject: [PATCH 1/4] support autoround v2.1 Signed-off-by: Kaihui-intel --- .azure-pipelines/scripts/ut/env_setup.sh | 2 +- neural_compressor/adaptor/pytorch.py | 22 ++++++-- .../adaptor/torch_utils/auto_round.py | 6 +-- .../adaptor/torch_utils/weight_only.py | 52 +++++++++++++------ neural_compressor/model/torch_model.py | 2 +- .../test_weight_only_adaptor_pytorch.py | 2 +- test/requirements.txt | 2 +- 7 files changed, 60 insertions(+), 28 deletions(-) diff --git a/.azure-pipelines/scripts/ut/env_setup.sh b/.azure-pipelines/scripts/ut/env_setup.sh index d5876b07cef..84e7fc654ec 100644 --- a/.azure-pipelines/scripts/ut/env_setup.sh +++ b/.azure-pipelines/scripts/ut/env_setup.sh @@ -92,7 +92,7 @@ elif [[ $(echo "${test_case}" | grep -c "tf pruning") != 0 ]]; then fi if [[ $(echo "${test_case}" | grep -c "api") != 0 ]] || [[ $(echo "${test_case}" | grep -c "adaptor") != 0 ]]; then - pip install auto-round + pip install git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c fi # test deps diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py index 530e15f2308..4df070d080f 100644 --- a/neural_compressor/adaptor/pytorch.py +++ b/neural_compressor/adaptor/pytorch.py @@ -4905,13 +4905,13 @@ def autoround_quantize(self, model, tune_cfg, dataloader): enable_minmax_tuning = self.recipes["autoround_args"].get("enable_minmax_tuning", True) lr = self.recipes["autoround_args"].get("lr", None) minmax_lr = self.recipes["autoround_args"].get("minmax_lr", None) - low_gpu_mem_usage = self.recipes["autoround_args"].get("low_gpu_mem_usage", True) + low_gpu_mem_usage = self.recipes["autoround_args"].get("low_gpu_mem_usage", False) iters = self.recipes["autoround_args"].get("iters", 200) seqlen = self.recipes["autoround_args"].get("seqlen", 2048) - n_samples = self.recipes["autoround_args"].get("n_samples", 512) + nsamples = self.recipes["autoround_args"].get("nsamples", 128) sampler = self.recipes["autoround_args"].get("sampler", "rand") seed = self.recipes["autoround_args"].get("seed", 42) - n_blocks = self.recipes["autoround_args"].get("n_blocks", 1) + nblocks = self.recipes["autoround_args"].get("nblocks", 1) gradient_accumulate_steps = self.recipes["autoround_args"].get("gradient_accumulate_steps", 1) not_use_best_mse = self.recipes["autoround_args"].get("not_use_best_mse", False) dynamic_max_gap = self.recipes["autoround_args"].get("dynamic_max_gap", -1) @@ -4922,6 +4922,12 @@ def autoround_quantize(self, model, tune_cfg, dataloader): bits = self.recipes["autoround_args"].get("bits", 4) group_size = self.recipes["autoround_args"].get("group_size", 128) sym = self.recipes["autoround_args"].get("scheme", "asym") == "sym" + act_bits = self.recipes["autoround_args"].get("act_bits", 32) + act_group_size = self.recipes["autoround_args"].get("act_group_size", None) + act_sym = self.recipes["autoround_args"].get("act_sym", None) + act_dynamic = self.recipes["autoround_args"].get("act_dynamic", True) + multimodal = self.recipes["autoround_args"].get("multimodal", False) + use_layer_wise = self.recipes["autoround_args"].get("use_layer_wise", False) if dataloader is not None: dataset = dataloader @@ -4944,15 +4950,21 @@ def autoround_quantize(self, model, tune_cfg, dataloader): low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - 
n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, data_type=data_type, scale_dtype=scale_dtype, + multimodal=multimodal, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_dynamic=act_dynamic, + use_layer_wise=use_layer_wise, ) return model, autoround_config diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index 78eca517221..a0de5ef2a75 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -13,7 +13,7 @@ # limitations under the License. -def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=512): +def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=128): """Generate a DataLoader for calibration using specified parameters. Args: @@ -25,7 +25,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 split (str, optional): The data split to use. Defaults to None. seed (int, optional): The random seed for reproducibility. Defaults to 42. bs (int, optional): The batch size. Defaults to 4. - n_samples (int, optional): The total number of samples to include. Defaults to 512. + n_samples (int, optional): The total number of samples to include. Defaults to 128. Returns: DataLoader: The DataLoader for the calibrated dataset. @@ -33,6 +33,6 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 dataloader = get_dataloader( - tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=seed, bs=bs, n_samples=n_samples + tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples ) return dataloader diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index c6bcf5f09e2..2adb6ec48e3 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -694,18 +694,24 @@ def autoround_quantize( enable_minmax_tuning: bool = True, lr: float = None, minmax_lr: float = None, - low_gpu_mem_usage: bool = True, + low_gpu_mem_usage: bool = False, iters: int = 200, seqlen: int = 2048, - n_samples: int = 512, + nsamples: int = 128, sampler: str = "rand", seed: int = 42, - n_blocks: int = 1, + nblocks: int = 1, gradient_accumulate_steps: int = 1, not_use_best_mse: bool = False, dynamic_max_gap: int = -1, data_type: str = "int", ##only support int for now scale_dtype: str = "fp16", + multimodal: bool = False, + act_bits: int = 32, + act_group_size: int = None, + act_sym: bool = None, + act_dynamic: bool = True, + use_layer_wise: bool = False, **kwargs, ): """Run autoround weight-only quantization. @@ -717,15 +723,19 @@ def autoround_quantize( sym (bool): Whether symmetric quantization is to be used (default is False). weight_config (dict): Configuration for weight quantization (default is an empty dictionary). weight_config={ - 'layer1':##layer_name - { - 'data_type': 'int', - 'bits': 4, - 'group_size': 32, - 'sym': False - } - ... 
- } + 'layer1':##layer_name + { + 'data_type': 'int', + 'bits': 4, + 'group_size': 32, + 'sym': False, + 'act_data_type': None, + 'act_bits': 32, + 'act_sym': None, + 'act_dynamic': True, + } + ..., + } enable_full_range (bool): Whether to enable full range quantization (default is False). batch_size (int): Batch size for training (default is 8). amp (bool): Whether to use automatic mixed precision (default is True). @@ -737,7 +747,7 @@ def autoround_quantize( enable_minmax_tuning (bool): Whether to enable weight min-max tuning (default is True). lr (float): The learning rate (default is None, will be set to 1.0/iters). minmax_lr (float): The learning rate for min-max tuning (default is None, it will be set to lr automatically). - low_gpu_mem_usage (bool): Whether to use low GPU memory (default is True). + low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). n_samples (int): Number of samples (default is 512). @@ -750,7 +760,11 @@ def autoround_quantize( data_type (str): The data type to be used (default is "int"). scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels have different choices. - + multimodal(bool): Enable multimodal model quantization, (default is "False"). + act_bits (int): Number of bits for activation quantization. Default is 32. + act_group_size (int): Group size for activation quantization. Default is None. + act_sym (bool): Whether to use symmetric activation quantization. Default is None. + act_dynamic (bool): Whether to use dynamic activation quantization. Default is True. Returns: The quantized model. """ @@ -776,15 +790,21 @@ def autoround_quantize( low_gpu_mem_usage=low_gpu_mem_usage, iters=iters, seqlen=seqlen, - n_samples=n_samples, + nsamples=nsamples, sampler=sampler, seed=seed, - n_blocks=n_blocks, + nblocks=nblocks, gradient_accumulate_steps=gradient_accumulate_steps, not_use_best_mse=not_use_best_mse, dynamic_max_gap=dynamic_max_gap, data_type=data_type, ## only support data_type scale_dtype=scale_dtype, + multimodal=multimodal, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + act_dynamic=act_dynamic, + low_cpu_mem_usage=use_layer_wise, **kwargs, ) qdq_model, weight_config = rounder.quantize() diff --git a/neural_compressor/model/torch_model.py b/neural_compressor/model/torch_model.py index 7338f196d46..91e189777b0 100644 --- a/neural_compressor/model/torch_model.py +++ b/neural_compressor/model/torch_model.py @@ -609,7 +609,7 @@ def export_compressed_model( self.model = pack_model( self.model, - weight_config=autoround_config, + layer_config=autoround_config, enable_full_range=enable_full_range, compression_dtype=compression_dtype, compression_dim=compression_dim, diff --git a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py index c3839a80b8a..61456b923f8 100644 --- a/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py +++ b/test/adaptor/pytorch_adaptor/test_weight_only_adaptor_pytorch.py @@ -760,7 +760,7 @@ def test_AutoRound_quant(self): tokenizer = transformers.AutoTokenizer.from_pretrained( "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True ) - dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, n_samples=20) + dataloader = get_dataloader(tokenizer, 32, 
dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=20) fp32_model = copy.deepcopy(self.gptj) conf = PostTrainingQuantConfig( approach="weight_only", diff --git a/test/requirements.txt b/test/requirements.txt index 3a24001cfd2..386b3ee4f2b 100644 --- a/test/requirements.txt +++ b/test/requirements.txt @@ -1,6 +1,6 @@ --find-links https://download.pytorch.org/whl/torch_stable.html accelerate==0.21.0 -auto-round +auto-round @ git+https://github.com/intel/auto-round.git@24b2e74070f2b4e6f26ff069ec75af74cf5b177c dynast==1.6.0rc1 horovod intel-extension-for-pytorch From 8712130561016a8a4e7866ef50036c0926424b4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 04:43:31 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/adaptor/torch_utils/auto_round.py | 4 +--- neural_compressor/adaptor/torch_utils/weight_only.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index a0de5ef2a75..b5aca6258dd 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -32,7 +32,5 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 """ from auto_round.calib_dataset import get_dataloader # pylint: disable=E0401 - dataloader = get_dataloader( - tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples - ) + dataloader = get_dataloader(tokenizer, seqlen, dataset_name=dataset_name, seed=seed, bs=bs, nsamples=nsamples) return dataloader diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index 2adb6ec48e3..ffd5789c612 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -715,6 +715,7 @@ def autoround_quantize( **kwargs, ): """Run autoround weight-only quantization. + Args: model: The PyTorch model to be quantized. tokenizer: An optional tokenizer for processing input data. If none is provided, a dataloader must be supplied. From 8e472f91e71bd8a3ba0c3bbc946a106f1d4853bd Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 12:44:21 +0800 Subject: [PATCH 3/4] fix docstring Signed-off-by: Kaihui-intel --- neural_compressor/adaptor/torch_utils/auto_round.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/auto_round.py b/neural_compressor/adaptor/torch_utils/auto_round.py index b5aca6258dd..9fd8667e801 100644 --- a/neural_compressor/adaptor/torch_utils/auto_round.py +++ b/neural_compressor/adaptor/torch_utils/auto_round.py @@ -25,7 +25,7 @@ def get_dataloader(tokenizer, seqlen, dataset_name="NeelNanda/pile-10k", seed=42 split (str, optional): The data split to use. Defaults to None. seed (int, optional): The random seed for reproducibility. Defaults to 42. bs (int, optional): The batch size. Defaults to 4. - n_samples (int, optional): The total number of samples to include. Defaults to 128. + nsamples (int, optional): The total number of samples to include. Defaults to 128. Returns: DataLoader: The DataLoader for the calibrated dataset. 
From 4f00f30eefd035f37ab54a5e001e8d97cb7cc6c3 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 16 Jul 2024 14:08:58 +0800 Subject: [PATCH 4/4] update docstring & layer_config Signed-off-by: Kaihui-intel --- neural_compressor/adaptor/torch_utils/weight_only.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/weight_only.py b/neural_compressor/adaptor/torch_utils/weight_only.py index ffd5789c612..5e21b97d10a 100644 --- a/neural_compressor/adaptor/torch_utils/weight_only.py +++ b/neural_compressor/adaptor/torch_utils/weight_only.py @@ -751,10 +751,10 @@ def autoround_quantize( low_gpu_mem_usage (bool): Whether to use low GPU memory (default is False). iters (int): Number of iterations (default is 200). seqlen (int): Data length of the sequence for tuning (default is 2048). - n_samples (int): Number of samples (default is 512). + nsamples (int): Number of samples (default is 128). sampler (str): The sampling method (default is "rand"). seed (int): The random seed (default is 42). - n_blocks (int): Number of blocks (default is 1). + nblocks (int): Number of blocks (default is 1). gradient_accumulate_steps (int): Number of gradient accumulation steps (default is 1). not_use_best_mse (bool): Whether to use mean squared error (default is False). dynamic_max_gap (int): The dynamic maximum gap (default is -1). @@ -777,7 +777,7 @@ def autoround_quantize( bits=bits, group_size=group_size, sym=sym, - weight_config=weight_config, + layer_config=weight_config, enable_full_range=enable_full_range, ##for symmetric, TODO support later batch_size=batch_size, amp=amp,
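# --- Usage sketch: new autoround_args recipe keys (illustrative, not part of this patch series) ---
# Patch 1/4 reads these keys via self.recipes["autoround_args"].get(...), so the
# sketch below shows how they would flow through the INC 2.x PostTrainingQuantConfig
# path.  The op_type_dict layout, the quantization.fit() call and the export step
# are assumptions based on the existing weight-only test flow; only the
# "autoround_args" keys and their defaults come from this diff.
import transformers

from neural_compressor import PostTrainingQuantConfig, quantization
from neural_compressor.adaptor.torch_utils.auto_round import get_dataloader

model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM"  # tiny model from the unit test
fp32_model = transformers.AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
dataloader = get_dataloader(tokenizer, 32, dataset_name="NeelNanda/pile-10k", seed=42, bs=8, nsamples=20)

conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={
        ".*": {  # assumed catch-all pattern, mirroring other weight-only configs
            "weight": {
                "bits": 4,
                "group_size": 128,
                "scheme": "asym",
                "algorithm": "AUTOROUND",
            },
        },
    },
    recipes={
        # Keys below are exactly those read in patch 1/4; values show the new
        # defaults, shrunk where noted to keep the example fast.
        "autoround_args": {
            "nsamples": 20,              # renamed from n_samples (default now 128)
            "nblocks": 1,                # renamed from n_blocks
            "seqlen": 32,                # matches the calibration dataloader above
            "iters": 20,                 # illustrative small value; default is 200
            "low_gpu_mem_usage": False,  # default flipped from True to False
            "act_bits": 32,              # new activation-quantization controls
            "act_group_size": None,
            "act_sym": None,
            "act_dynamic": True,
            "multimodal": False,
            "use_layer_wise": False,     # forwarded to AutoRound as low_cpu_mem_usage
        },
    },
)

q_model = quantization.fit(fp32_model, conf, calib_dataloader=dataloader)
# Export goes through pack_model(), whose argument is now layer_config= rather
# than weight_config= (see the torch_model.py hunk in patch 1/4).
q_model.export_compressed_model()
# ----------------------------------------------------------------------------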