From a9df7da60927d387eaebbb0f630ee0ec90c75f74 Mon Sep 17 00:00:00 2001 From: yan tomsinsky Date: Sun, 19 May 2024 16:39:09 +0300 Subject: [PATCH 01/51] [SW-184941] INC CI, CD and Promotion Change-Id: I60c420f9776e1bdab7bb9e02e5bcbdb6891bfe52 --- requirements_pt.txt | 1 - setup.py | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/requirements_pt.txt b/requirements_pt.txt index c3891a27b99..018b1b9dbf6 100644 --- a/requirements_pt.txt +++ b/requirements_pt.txt @@ -3,6 +3,5 @@ numpy < 2.0 peft prettytable psutil -py-cpuinfo pydantic tbb diff --git a/setup.py b/setup.py index bb23ac7866a..f4706563d00 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,5 @@ import os import re -import subprocess import sys from io import open @@ -135,11 +134,10 @@ def get_build_version(): description="Repository of Intel® Neural Compressor", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", - keywords="quantization,auto-tuning,post-training static quantization," - "post-training dynamic quantization,quantization-aware training", license="Apache 2.0", - url="https://github.com/intel/neural-compressor", - packages=include_packages, + keywords="quantization", + url="", + packages=find_packages(include=['neural_compressor', 'neural_compressor.*']), include_package_data=True, package_data=package_data, install_requires=install_requires, From 14f031e516262e3a953febb71d8e2b6b2e0bec18 Mon Sep 17 00:00:00 2001 From: Ron Ben Moshe Date: Thu, 6 Jun 2024 10:58:15 +0300 Subject: [PATCH 02/51] [SW-183320]updated setup.py Change-Id: I592af89486cb1d9e0b5197521c428920197a9103 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index f4706563d00..a2392358572 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ import os import re +import subprocess import sys from io import open From ee7e5c8f72db7d20df377cf19b26edfaa71892c5 Mon Sep 17 00:00:00 2001 From: Zhou Yuwen Date: Wed, 22 May 2024 07:39:06 +0000 Subject: [PATCH 03/51] [SW-177474] add HQT FP8 porting code Change-Id: I4676f13a5ed43c444f2ec68675cc41335e7234dd Signed-off-by: Zhou Yuwen --- .../quantization/habana_fp8/README.md | 24 - .../models/configuration_chatglm.py | 61 - .../habana_fp8/models/modeling_chatglm.py | 1294 ----------------- .../habana_fp8/models/modeling_llama.py | 1263 ---------------- .../models/tokenization_baichuan.py | 255 ---- .../quantization/habana_fp8/requirement.txt | 7 - .../quantization/habana_fp8/run_llm.py | 222 --- .../quantization/habana_fp8/utils.py | 255 ---- examples/fp8_sample/README.md | 96 ++ examples/fp8_sample/maxabs_measure.json | 7 + examples/fp8_sample/maxabs_quant.json | 8 + examples/fp8_sample/quant_config.json | 8 + examples/fp8_sample/sample_one_step.py | 57 + examples/fp8_sample/sample_two_steps.py | 50 + .../{habana_fp8 => fp8_quant}/__init__.py | 9 +- .../torch/algorithms/fp8_quant/common.py | 98 ++ .../torch/algorithms/fp8_quant/fp8_quant.py | 61 + .../algorithms/fp8_quant/helper_modules.py | 118 ++ .../torch/algorithms/habana_fp8/fp8_quant.py | 220 --- .../torch/algorithms/habana_fp8/modules.py | 487 ------- .../torch/algorithms/habana_fp8/observer.py | 440 ------ .../torch/algorithms/habana_fp8/save_load.py | 105 -- .../torch/algorithms/habana_fp8/scale.py | 59 - .../algorithms/habana_fp8/tensor/__init__.py | 13 - .../algorithms/habana_fp8/tensor/convert.cpp | 63 - neural_compressor/torch/amp/__init__.py | 15 - neural_compressor/torch/amp/autocast.py | 95 -- neural_compressor/torch/amp/fp8/__init__.py | 13 - 
neural_compressor/torch/amp/fp8/functions.py | 134 -- .../torch/quantization/__init__.py | 2 +- .../torch/quantization/algorithm_entry.py | 30 +- .../torch/quantization/config.py | 161 +- .../torch/quantization/quantize.py | 28 +- setup.py | 3 +- test/3x/torch/amp/test_fp8_amp.py | 75 - .../torch/quantization/habana_fp8/test_fp8.py | 189 --- 36 files changed, 660 insertions(+), 5365 deletions(-) delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py delete mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py create mode 100644 examples/fp8_sample/README.md create mode 100644 examples/fp8_sample/maxabs_measure.json create mode 100644 examples/fp8_sample/maxabs_quant.json create mode 100644 examples/fp8_sample/quant_config.json create mode 100644 examples/fp8_sample/sample_one_step.py create mode 100644 examples/fp8_sample/sample_two_steps.py rename neural_compressor/torch/algorithms/{habana_fp8 => fp8_quant}/__init__.py (70%) create mode 100644 neural_compressor/torch/algorithms/fp8_quant/common.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/helper_modules.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/modules.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/observer.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/save_load.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/scale.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py delete mode 100644 neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp delete mode 100644 neural_compressor/torch/amp/__init__.py delete mode 100644 neural_compressor/torch/amp/autocast.py delete mode 100644 neural_compressor/torch/amp/fp8/__init__.py delete mode 100644 neural_compressor/torch/amp/fp8/functions.py delete mode 100644 test/3x/torch/amp/test_fp8_amp.py delete mode 100644 test/3x/torch/quantization/habana_fp8/test_fp8.py diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md deleted file mode 100644 index eb39321b173..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Run - -## Run FP32 model -``` python -python run_llm.py --model [model_name_or_path] --to_graph [--performance]|[--accuracy --tasks 
lambada_openai --batch_size 8]|[--generate --max_new_tokens 10] -``` - -## Run BF16/FP16 model -``` python -python run_llm.py --model [model_name_or_path] --approach cast --precision [bf16|fp16] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10] -``` - -## Run FP8 model -``` python -python run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10] -``` - -# Multi-card Inference -With deepspeed we can leverage multi-cards inference with a prefix in command, below it's a demonstration of 4 card inference. - -```python -deepspeed --num_gpus=4 run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10] -``` -deepspeed --num_gpus=4 run_llm.py --model facebook/opt-125m --approach static --precision fp8_e4m3 --to_graph --accuracy --tasks lambada_openai --batch_size 8 \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py deleted file mode 100644 index 35600185f5a..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py +++ /dev/null @@ -1,61 +0,0 @@ -from transformers import PretrainedConfig - - -class ChatGLMConfig(PretrainedConfig): - model_type = "chatglm" - def __init__( - self, - num_layers=28, - padded_vocab_size=65024, - hidden_size=4096, - ffn_hidden_size=13696, - kv_channels=128, - num_attention_heads=32, - seq_length=2048, - hidden_dropout=0.0, - classifier_dropout=None, - attention_dropout=0.0, - layernorm_epsilon=1e-5, - rmsnorm=True, - apply_residual_connection_post_layernorm=False, - post_layer_norm=True, - add_bias_linear=False, - add_qkv_bias=False, - bias_dropout_fusion=True, - multi_query_attention=False, - multi_query_group_num=1, - apply_query_key_layer_scaling=True, - attention_softmax_in_fp32=True, - fp32_residual_connection=False, - quantization_bit=0, - pre_seq_len=None, - prefix_projection=False, - **kwargs - ): - self.num_layers = num_layers - self.vocab_size = padded_vocab_size - self.padded_vocab_size = padded_vocab_size - self.hidden_size = hidden_size - self.ffn_hidden_size = ffn_hidden_size - self.kv_channels = kv_channels - self.num_attention_heads = num_attention_heads - self.seq_length = seq_length - self.hidden_dropout = hidden_dropout - self.classifier_dropout = classifier_dropout - self.attention_dropout = attention_dropout - self.layernorm_epsilon = layernorm_epsilon - self.rmsnorm = rmsnorm - self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm - self.post_layer_norm = post_layer_norm - self.add_bias_linear = add_bias_linear - self.add_qkv_bias = add_qkv_bias - self.bias_dropout_fusion = bias_dropout_fusion - self.multi_query_attention = multi_query_attention - self.multi_query_group_num = multi_query_group_num - self.apply_query_key_layer_scaling = apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = attention_softmax_in_fp32 - self.fp32_residual_connection = fp32_residual_connection - self.quantization_bit = quantization_bit - 
self.pre_seq_len = pre_seq_len - self.prefix_projection = prefix_projection - super().__init__(**kwargs) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py deleted file mode 100644 index be1cd520af5..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py +++ /dev/null @@ -1,1294 +0,0 @@ -""" PyTorch ChatGLM model. """ - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any -from copy import deepcopy - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" -_CONFIG_FOR_DOC = "ChatGLMConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm3-6b", - # See all ChatGLM models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config: ChatGLMConfig): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 - self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(kv_size, config.hidden_size), - torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, kv_size) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, - config.num_layers * config.kv_channels * config.multi_query_group_num * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: 
bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. - """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - -### INC change ### -# @torch.jit.script - -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * 
torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. 
[b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. - # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. - # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - if use_cache: - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = 
key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. 
- self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. - def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
- ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. - """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GLMTransformer): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit transposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.multi_query_group_num, - self.kv_channels - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - return past_key_values - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), - attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary 
positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. - hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - if past_key_values is not None: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True, - "use_cache": use_cache - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[-1:] - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
- """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, output, history): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content, history - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = tokenizer.build_chat_input(query, history=history, role=role) - inputs = inputs.to(self.device) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - history.append({"role": role, "content": query}) - response, history = self.process_response(response, history) - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, - logits_processor=None, return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None: - inputs = tokenizer.build_chat_input(query, history=history, role=role) - else: - inputs = tokenizer.build_chat_input(query, role=role) - inputs = inputs.to(self.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[0] - if self.transformer.pre_seq_len is not None: - past_length -= self.transformer.pre_seq_len - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - history.append({"role": role, "content": query}) - for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - if response and response[-1] != "�": - response, new_history = self.process_response(response, history) - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - model_kwargs["use_cache"] = generation_config.use_cache - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self - - -class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.num_labels = config.num_labels - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - - self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) - if config.classifier_dropout is not None: - self.dropout = nn.Dropout(config.classifier_dropout) - else: - self.dropout = None - self.config = config - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - full_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: 
Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - full_attention_mask=full_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - pooled_hidden_states = hidden_states[-1] - if self.dropout is not None: - pooled_hidden_states = self.dropout(pooled_hidden_states) - logits = self.classifier_head(pooled_hidden_states) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze().float(), labels.squeeze()) - else: - loss = loss_fct(logits.float(), labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) - - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py deleted file mode 100644 index 4cd1b6e18e8..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py +++ /dev/null @@ -1,1263 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - logging, - replace_return_docstrings, -) -from transformers.utils.import_utils import is_torch_fx_available -from transformers.models.llama.configuration_llama import LlamaConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -### INC code ### -from neural_compressor.torch.quantization.modules import Matmul, BatchMatmul, Autocast - -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. -# It means that the function will not be traced through and simply appear as a node in the graph. -if is_torch_fx_available(): - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" - ) - return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) - - -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" - ) - return AttentionMaskConverter._make_causal_mask( - input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length - ) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) - self._init_rope() - ### INC code ### - self.matmul1 = Matmul() - self.matmul2 = Matmul() - self.cast1 = Autocast() - self.cast2 = Autocast() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - ### INC code ### - key_states = self.cast1(key_states) - value_states = self.cast2(value_states) - # import habana_frameworks.torch.core as htcore - # htcore.mark_step() - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - ### INC code ### - attn_weights = self.matmul1(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - - ### INC code ### - attn_output = self.matmul2(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = 
attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaFlashAttention2(LlamaAttention): - """ - Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # LlamaFlashAttention2 attention does not support output_attentions - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. 
(LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) - """ - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=self.is_causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = ( - LlamaAttention(config=config) - if not getattr(config, "_flash_attn_2_enabled", False) - else LlamaFlashAttention2(config=config) - ) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = 
inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if getattr(self.config, "_flash_attn_2_enabled", False): - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # embed positions - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - 
return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - 
past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( - logits.device - ) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + 
transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py deleted file mode 100644 index 5b7054d3227..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2023 Baichuan Inc. All Rights Reserved. - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm - -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="", - bos_token="", - eos_token="", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - ### INC code ### - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - #self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - #self.sp_model.Load(vocab_file) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. 
- - Returns: - `Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
- """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt deleted file mode 100644 index d3655acd742..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -datasets -accelerate -SentencePiece -lm_eval==0.3.0 -openpyxl -einops diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py deleted file mode 100644 index e77ef2c6a33..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "False" - -### USE_GAUDI2_SCALE requires PT_USE_FP8_AMAX for torch.mm/bmm, or got failure -# os.environ["USE_GAUDI2_SCALE"] = "True" -# os.environ["PT_USE_FP8_AMAX"] = "True" - -### graphs will dump to .graph_dumps folder -# os.environ["GRAPH_VISUALIZATION"] = "True" -# import shutil -# shutil.rmtree(".graph_dumps", ignore_errors=True) - -import argparse -import time -import json -import re -import torch -import habana_frameworks.torch.hpex -import torch.nn.functional as F -import deepspeed -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -import habana_frameworks.torch.core as htcore - -from utils import show_msg, eval_func, init_empty_model, init_model, init_tokenizer - - -torch.set_grad_enabled(False) -htcore.hpu_set_env() -torch.device('hpu') - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model", nargs="?", default="facebook/opt-125m" -) -parser.add_argument( - "--trust_remote_code", default=True, - help="Transformers parameter: use the external repo") -parser.add_argument( - "--revision", default=None, - help="Transformers parameter: set the model hub commit number") -parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") -parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--to_graph", action="store_true") -parser.add_argument("--approach", type=str, default=None, - help="Select from ['dynamic', 'static' 'cast']") -parser.add_argument("--precision", type=str, default='fp32', - help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16', 'fp32'], \ - ['bf16', 'fp16'] only work with cast approach") -parser.add_argument("--autotune", action="store_true") -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--performance", action="store_true") -parser.add_argument("--generate", action="store_true") -parser.add_argument("--skip_fp8_mm", action="store_true") -parser.add_argument("--dump_to_excel", action="store_true") -parser.add_argument("--save", action="store_true") -parser.add_argument("--load", action="store_true") -parser.add_argument("--batch_size", default=1, type=int, - help="For accuracy measurement only.") 
-parser.add_argument("--pad_max_length", default=512, type=int, - help="Pad input ids to max length.") -parser.add_argument("--calib_iters", default=100, type=int, - help="calibration iters.") -parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \ - type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa", - "rte", "openbookqa", "lambada_standard", "wikitext"], - help="tasks list for accuracy validation") -parser.add_argument("--limit", default=None, type=int, - help="the sample num of evaluation.") -parser.add_argument("--max_new_tokens", default=100, type=int, - help="calibration iters.") -parser.add_argument('--buckets', type=int, nargs='+', \ - help="Input length buckets to use with static_shapes", default=[256, 512]) -parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") -parser.add_argument("--skip_lm_head", action="store_true") -args = parser.parse_args() - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -if args.load: - user_model = init_empty_model(args.model) -else: - user_model = init_model(args) -user_model.eval() - - -tokenizer = init_tokenizer(args) - - -### dynamic & static quantization ### -if args.approach in ["dynamic", "static"] and not args.load: - print("device:", next(user_model.parameters()).device) - from neural_compressor.torch.quantization import ( - quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set - ) - dtype = args.precision - if args.approach == "dynamic": - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - user_model = quantize_dynamic(user_model, dtype, inplace=True) - elif args.approach == "static": - qconfig = FP8Config(w_dtype=dtype, act_dtype=dtype, approach="static") - if args.skip_lm_head: - fp32_config = FP8Config(w_dtype="fp32", act_dtype="fp32") - qconfig.set_local("lm_head", fp32_config) - # dataset - from datasets import load_dataset - calib_dataset = load_dataset(args.dataset, split="train").select(range(100)) - calib_dataset = calib_dataset.shuffle(seed=42) - calib_data = [] - for examples in calib_dataset: - calib_data.append( - tokenizer( - examples["text"], - return_tensors="pt", - max_length=64, - padding="max_length", - truncation=True - ) - ) - - def calib_func(model): - for i, calib_input in enumerate(calib_data): - if i >= args.calib_iters: - break - model( - input_ids=calib_input["input_ids"].to('hpu'), - attention_mask=calib_input["attention_mask"].to('hpu'), - ) - - user_model = quantize(user_model, qconfig, calib_func, inplace=True) - # saving - print(user_model) - if args.save and local_rank in [-1, 0]: - user_model.save("saved_results") - - -if args.load: - from neural_compressor.torch.quantization import load - user_model = load("saved_results", user_model) - - -if args.approach in ["dynamic", "static"] or args.load: - # It enables weights constant folding - from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - _mark_params_as_const(user_model) # can reduce memory allocated and speed up - _check_params_as_const(user_model) - - - -# If torch.matmul and torch.bmm are not replaced by INC module, -# Below codes can make torch.matmul and torch.bmm run on fp8 by injection. 
-if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']: - def replace_torch_mm_bmm(): - from neural_compressor.torch.amp.fp8.functions import fp8_matmul - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - - replace_torch_mm_bmm() - - -# inference optimization -if args.to_graph: - import habana_frameworks.torch.hpu.graphs as htgraphs - user_model = htgraphs.wrap_in_hpu_graph(user_model) - - -# dump message of HPU after quantization or reloading -show_msg() - - -### generation, performance and accuracy validation ### -if args.generate: - input_prompt = "Here is my prompt" - print("Prompt sentence:", input_prompt) - generation_config = { - "min_new_tokens": args.max_new_tokens, "max_new_tokens": args.max_new_tokens, - # "do_sample": False, "temperature": 0.9, "num_beams": 4, - } - input_tokens = tokenizer(input_prompt, return_tensors="pt").to('hpu') - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - outputs = user_model.generate(**input_tokens, **generation_config) - else: - outputs = user_model.generate(**input_tokens, **generation_config) - - output_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - eval_end = time.perf_counter() - print("Generated sentence:", output_sentence) - print("Duration:", eval_end - eval_start) - - -if args.performance: - eval_start = time.perf_counter() - input_prompt = "Intel is a company which" - input_tokens = torch.ones((1, 128), dtype=torch.long).to('hpu') - generation_config = {"min_new_tokens": 100, "max_new_tokens": 100} - outputs = user_model.generate(input_tokens, **generation_config) - print("Duration of generating 100 tokens :", time.perf_counter() - eval_start) - - -if args.accuracy: - eval_func(user_model, tokenizer=tokenizer, args=args) - -# dump final message of HPU -show_msg() diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py deleted file mode 100644 index 843287cddfa..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py +++ /dev/null @@ -1,255 +0,0 @@ -import os -import re -import torch -from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -def init_model(args): - import deepspeed - model_dtype = torch.float32 - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - if world_size > 1: - config = AutoConfig.from_pretrained(args.model) - model_dtype = torch.bfloat16 # RuntimeErrorCastToFp8V2 input must be of float or bfloat16 dtype - deepspeed.init_distributed(dist_backend="hccl") - with deepspeed.OnDevice(dtype=model_dtype, device="meta"): - user_model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - import tempfile - checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") - from optimum.habana.checkpoint_utils import write_checkpoints_json # in optimum-habana - write_checkpoints_json( - args.model, - local_rank, - checkpoints_json, - token=None, - ) 
- else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - device_map='hpu', - torch_dtype=model_dtype, - ) - elif re.search("chatglm", args.model.lower()): - from models.modeling_chatglm import ChatGLMForConditionalGeneration - user_model = ChatGLMForConditionalGeneration.from_pretrained( - args.model, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # print(user_model.transformer.output_layer.weight.dtype) # always fp16 - user_model.float() # static fp8 need float32 for graph compiler - else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # load weight for multi-cards - if world_size > 1: - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - ds_inference_kwargs = {"dtype": model_dtype} - ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} - ds_inference_kwargs["enable_cuda_graph"] = False - from transformers.models.llama.modeling_llama import LlamaDecoderLayer - ds_inference_kwargs["injection_policy"] = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} - ds_inference_kwargs["checkpoint"] = checkpoints_json.name - ds_model = deepspeed.init_inference(user_model, **ds_inference_kwargs) - else: - ds_model = deepspeed.init_inference(user_model, - mp_size=world_size, - replace_with_kernel_inject=False) - user_model = ds_model.module - return user_model - - -def init_empty_model(model_name): - from accelerate import init_empty_weights - model_dtype = torch.float32 - config = AutoConfig.from_pretrained(model_name) - with init_empty_weights(): - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - return model - - -def init_tokenizer(args): - # tokenizer - if re.search("baichuan", args.model.lower()): - from models.tokenization_baichuan import BaichuanTokenizer - tokenizer = BaichuanTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def show_msg(): - import numpy as np - import glob - from habana_frameworks.torch.hpu import memory_stats - print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*"))) - mem_stats = memory_stats() - mem_dict = { - "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2), - "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2), - "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2), - } - for k, v in mem_dict.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - - -def itrex_bootstrap_stderr(f, xs, iters): - from lm_eval.metrics import _bootstrap_internal, sample_stddev - res = [] - chunk_size = min(1000, iters) - it = _bootstrap_internal(f, chunk_size) - for i in range(iters // chunk_size): - bootstrap = it((i, xs)) - res.extend(bootstrap) - return sample_stddev(res) - - -def save_to_excel(dict): - import pandas as pd - df_new = pd.DataFrame(dict) - try: - df_existing = pd.read_excel('output.xlsx') - except FileNotFoundError: - df_existing = pd.DataFrame() - df_combined = pd.concat([df_existing, df_new], axis=0, ignore_index=True) - df_combined.to_excel('output.xlsx', index=False, engine='openpyxl', header=True) - - -def eval_func(user_model, tokenizer, args): - import os - import re - import 
time - import json - import torch - import habana_frameworks.torch.hpex - import torch.nn.functional as F - import lm_eval - import lm_eval.tasks - import lm_eval.evaluator - - # to avoid out-of-memory caused by Popen for large language models. - lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr - - class HabanaModelAdapter(lm_eval.base.BaseLM): - def __init__(self, tokenizer, model, args, options): - super().__init__() - self.tokenizer = tokenizer - self.model = model.eval() - self._batch_size = args.batch_size - self.buckets = list(sorted(args.buckets)) - self.options = options - self._device = "hpu" - torch.set_grad_enabled(False) - - @property - def eot_token_id(self): - return self.model.config.eos_token_id - - @property - def max_length(self): - return self.buckets[-1] - - @property - def max_gen_toks(self): - raise NotImplementedError() - - @property - def batch_size(self): - return self._batch_size - - @property - def device(self): - # We need to do padding ourselves, otherwise we'll end up with recompilations - # Returning 'cpu' to keep tensors on CPU in lm_eval code - return 'cpu' # 'hpu' - - def tok_encode(self, string): - if ( - re.search("chatglm3", args.model.lower()) or - re.search("llama", args.model.lower()) or - re.search("mistral", args.model.lower()) - ): - string = string.lstrip() - return self.tokenizer.encode(string, add_special_tokens=False) - - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens, skip_special_tokens=True) - - def _model_generate(self, context, max_length, eos_token_id): - raise NotImplementedError() - - def find_bucket(self, length): - return [b for b in self.buckets if b >= length][0] - - def _model_call(self, inputs): - seq_length = inputs.shape[-1] - padding_length = 0 - bucket_length = self.find_bucket(seq_length) - padding_length = bucket_length - seq_length - inputs = F.pad(inputs, (0, padding_length), value=self.model.config.pad_token_id) - logits = self.model(inputs.to(self._device))["logits"].cpu() - - if padding_length > 0: - logits = logits[:, :-padding_length, :] - logits = logits.to(torch.float32) - return logits - - lm_tasks = lm_eval.tasks.get_task_dict(args.tasks) - options = None - lm = HabanaModelAdapter(tokenizer, user_model, args, options) - - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - else: - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - print(lm_eval.evaluator.make_table(results)) - eval_end = time.perf_counter() - print("Duration:", eval_end - eval_start) - results['args'] = vars(args) - results['duration'] = eval_end - eval_start - - # make sure that result is dumped only once during multi-cards evaluation - local_rank = int(os.getenv('LOCAL_RANK', '-1')) - if local_rank in [-1, 0]: - dumped = json.dumps(results, indent=2) - accu_dict = {} - case_name = str(args.approach) + "-" + args.precision - for task_name in args.tasks: - if task_name == "wikitext": - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True) - accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]] 
-            else:
-                print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True)
-                accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]]
-        accu_dict["duration"] = [args.model, case_name, results["duration"]]
-        if args.dump_to_excel:
-            save_to_excel(accu_dict)
-        return results["results"][task_name]["acc"]
diff --git a/examples/fp8_sample/README.md b/examples/fp8_sample/README.md
new file mode 100644
index 00000000000..b758768ef0f
--- /dev/null
+++ b/examples/fp8_sample/README.md
@@ -0,0 +1,96 @@
+### Usage demo
+
+#### Two steps to get a quantized model
+
+```diff
+import torch
++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration
+import habana_frameworks.torch.core as htcore
+
+class M(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.fc1 = torch.nn.Linear(10, 5)
+        self.fc2 = torch.nn.Linear(5, 10)
+
+    def forward(self, inp):
+        x1 = self.fc1(inp)
+        x2 = self.fc2(x1)
+        return x2
+
+model = M().eval()
+
++ config = FP8Config.from_json_file(args.quant_config)  # args.quant_config is the path of the JSON config file
+
++ if config.measure:
++     model = prepare(model, config)
+
++ if config.quantize:
++     htcore.hpu_initialize()
++     model = convert(model, config)
+
+# user code runs here
+with torch.no_grad():
+    model.to("hpu")
+    output = model(torch.randn(1, 10).to("hpu"))
+    print(output)
+
++ if config.measure:
++     finalize_calibration(model)
+```
+
+
+The complete script and configuration files are available in [sample_two_steps.py](./sample_two_steps.py), [maxabs_measure.json](./maxabs_measure.json) and [maxabs_quant.json](./maxabs_quant.json).
+
+First, measure the tensor quantization statistics:
+```shell
+python sample_two_steps.py --quant_config=maxabs_measure.json
+```
+
+Then quantize the model based on the previous measurements:
+```shell
+python sample_two_steps.py --quant_config=maxabs_quant.json
+```
+
+#### One step to get a quantized model
+
+```diff
+import torch
++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration
+import habana_frameworks.torch.core as htcore
+
+class M(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.fc1 = torch.nn.Linear(10, 5)
+        self.fc2 = torch.nn.Linear(5, 10)
+
+    def forward(self, inp):
+        x1 = self.fc1(inp)
+        x2 = self.fc2(x1)
+        return x2
+
+model = M().to("hpu")
+
++ config = FP8Config.from_json_file(args.quant_config)  # args.quant_config is the path of the JSON config file
++ model = prepare(model, config)
+
+# user code runs calibration
+with torch.no_grad():
+    output = model(torch.randn(1, 10).to("hpu"))
+    print(output)
+
++ finalize_calibration(model)
++ model = convert(model)
+
+# user code benchmarks the quantized model
+with torch.no_grad():
+    output = model(torch.randn(1, 10).to("hpu"))
+    print(output)
+```
+
+The complete script and config are available in [sample_one_step.py](./sample_one_step.py) and [quant_config.json](./quant_config.json).
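For orientation, the same one-step flow can also be applied to a Hugging Face causal LM on Gaudi. The sketch below is an editorial illustration rather than part of this patch: the model name, prompt, and single-batch calibration pass are placeholders, and a real run would calibrate on a proper dataset. It only reuses the `FP8Config`/`prepare`/`finalize_calibration`/`convert` calls shown above.

```python
import torch
import habana_frameworks.torch.core as htcore
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare

model_name = "facebook/opt-125m"  # placeholder model; any causal LM supported on Gaudi should work
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).eval().to("hpu")
htcore.hpu_initialize()

# AUTO mode (see quant_config.json) measures and quantizes in a single run
config = FP8Config.from_json_file("quant_config.json")
model = prepare(model, config)

# short calibration pass over a placeholder prompt
with torch.no_grad():
    inputs = tokenizer("Intel Neural Compressor FP8 example", return_tensors="pt").to("hpu")
    model(**inputs)

finalize_calibration(model)
model = convert(model)

# the quantized model is ready for inference
with torch.no_grad():
    print(model(**inputs).logits.shape)
```

The original toy sample can still be run directly with the command below.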
+ +```shell +python sample_one_step.py --quant_config=quant_config.json +``` diff --git a/examples/fp8_sample/maxabs_measure.json b/examples/fp8_sample/maxabs_measure.json new file mode 100644 index 00000000000..8d55f33e57a --- /dev/null +++ b/examples/fp8_sample/maxabs_measure.json @@ -0,0 +1,7 @@ +{ + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/maxabs_quant.json b/examples/fp8_sample/maxabs_quant.json new file mode 100644 index 00000000000..d1f76f8f630 --- /dev/null +++ b/examples/fp8_sample/maxabs_quant.json @@ -0,0 +1,8 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/quant_config.json b/examples/fp8_sample/quant_config.json new file mode 100644 index 00000000000..c139d13bbea --- /dev/null +++ b/examples/fp8_sample/quant_config.json @@ -0,0 +1,8 @@ +{ + "mode": "AUTO", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/sample_one_step.py b/examples/fp8_sample/sample_one_step.py new file mode 100644 index 00000000000..18eb7bfba4c --- /dev/null +++ b/examples/fp8_sample/sample_one_step.py @@ -0,0 +1,57 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + + +# 1. python sample_one_step.py --quant_config=quant_config.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +def eval_func(model): + # user's eval func + input = torch.randn(1, 10) + model(input.to("hpu")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval().to("hpu") + htcore.hpu_initialize() + + config = FP8Config.from_json_file(args.quant_config) + model = prepare(model, config) + + # for calibration + with torch.no_grad(): + # model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + + finalize_calibration(model) + model = convert(model) + print(model) + + # for benchmark + with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) diff --git a/examples/fp8_sample/sample_two_steps.py b/examples/fp8_sample/sample_two_steps.py new file mode 100644 index 00000000000..9e17748b9b0 --- /dev/null +++ b/examples/fp8_sample/sample_two_steps.py @@ -0,0 +1,50 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + +# 1. python sample_two_steps.py --quant_config=maxabs_measure.json +# 2. 
python sample_two_steps.py --quant_config=maxabs_quant.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval() + config = FP8Config.from_json_file(args.quant_config) + + if config.measure: + model = prepare(model, config) + + if config.quantize: + htcore.hpu_initialize() + model = convert(model, config) + print(model) + + with torch.no_grad(): + model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + print(output) + + if config.measure: + finalize_calibration(model) diff --git a/neural_compressor/torch/algorithms/habana_fp8/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/__init__.py similarity index 70% rename from neural_compressor/torch/algorithms/habana_fp8/__init__.py rename to neural_compressor/torch/algorithms/fp8_quant/__init__.py index fe3a05d7d0b..d16760b5e81 100644 --- a/neural_compressor/torch/algorithms/habana_fp8/__init__.py +++ b/neural_compressor/torch/algorithms/fp8_quant/__init__.py @@ -12,5 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .fp8_quant import quantize_dynamic, quantize, white_list -from .save_load import save, load +from neural_compressor.torch.algorithms.fp8_quant.common import ( + update_mode, + save_calib_result, + restore_patched_module, + with_patched_module, +) +from neural_compressor.torch.algorithms.fp8_quant.fp8_quant import FP8Quantizer diff --git a/neural_compressor/torch/algorithms/fp8_quant/common.py b/neural_compressor/torch/algorithms/fp8_quant/common.py new file mode 100644 index 00000000000..b038a367a78 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/common.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import tempfile +from collections import namedtuple +from pathlib import Path +from typing import Union + +import torch + + +def save_calib_result(model): + import habana_quantization_toolkit as hqt + hqt.finish_measurements(model) + + +def update_mode(config_path, measure_step=False, quant_step=False): + with open(config_path, 'r') as file: + config = json.load(file) + + if (measure_step and config.get("mode") == "MEASURE") or (quant_step and config.get("mode") == "QUANTIZE"): + return config_path + else: + if measure_step: + config["mode"] = "MEASURE" + if quant_step: + config["mode"] = "QUANTIZE" + + temp_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False) + temp_file_path = temp_file.name + + with open(temp_file_path, 'w') as temp_file: + json.dump(config, temp_file) + + return temp_file_path + + +def generate_model_info(model): + mod_inst_info = namedtuple("ModInstInfo", ["name", "parent"]) + parent_child_mod_dict = {} + + def create_mod_info_recursion(parent): + for name, mod in parent.named_children(): + parent_child_mod_dict[mod] = mod_inst_info(name=name, parent=parent) + create_mod_info_recursion(mod) + + create_mod_info_recursion(model) + return parent_child_mod_dict + + +def get_patched_mod_list(): + from habana_quantization_toolkit._core.common import mod_default_dict + + patched_mod_list = [] + for patched_mod in mod_default_dict.values(): + patched_mod_list.append(patched_mod.patched_module.__name__) + return patched_mod_list + + +def restore_patched_module(patched_model): + from neural_compressor.torch.algorithms.fp8_quant.helper_modules import helper_mods + patched_mod_list = get_patched_mod_list() + + parent_child_mod_dict = generate_model_info(patched_model) + with torch.no_grad(): + for name, patched_mod in patched_model.named_modules(): + patched_mod_type_str = patched_mod.__class__.__name__ + if patched_mod_type_str in patched_mod_list: + parent = parent_child_mod_dict[patched_mod].parent + name = parent_child_mod_dict[patched_mod].name + class_name_org = getattr(patched_mod, "class_name_org", None) or \ + patched_mod.__class__.__name__.split("Patched")[-1] + origin_mod = helper_mods[class_name_org](patched_mod) + origin_mod.forward = patched_mod.forward_orig + setattr(parent, name, origin_mod) + + +def with_patched_module(model): + patched_mod_list = get_patched_mod_list() + + for name, mod in model.named_modules(): + mod_type = mod.__class__.__name__ + if mod_type in patched_mod_list: + return True + return False diff --git a/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py new file mode 100644 index 00000000000..f9ce9145569 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from neural_compressor.common.utils import FP8_QUANT +from neural_compressor.torch.algorithms import Quantizer +from neural_compressor.torch.algorithms.fp8_quant import ( + restore_patched_module, + update_mode, + with_patched_module, +) + + +class FP8Quantizer(Quantizer): + def __init__(self, quant_config): + super().__init__(quant_config) + if isinstance(quant_config, dict): + json_file = [cfg.json_file for cfg in quant_config.values()] + assert len(json_file) > 0, "Cannot get json file from config." + self.quant_config = json_file[0] + + def prepare(self, model): + _prepare(model, self.quant_config) + return model + + def convert(self, model): + if with_patched_module(model): + # for INC flow, it calls `prepare` and then `convert` user-facing API in one run + restore_patched_module(model) + _convert(model, self.quant_config) + return model + + +def _convert(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to QUANTIZE + config_path = update_mode(config_path, quant_step=True) + + return hqt.prep_model(model, config_path) + + +def _prepare(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to MEASURE + config_path = update_mode(config_path, measure_step=True) + + return hqt.prep_model(model, config_path) diff --git a/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py new file mode 100644 index 00000000000..6c7154328d7 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +# For mapping revert patched module to origin module + +helper_mods = {} + +def helper_mod_register(name): + def decorator(mod): + helper_mods[name] = mod + return mod + return decorator + +@helper_mod_register(name="Matmul") +class Matmul(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Linear") +class Linear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="FalconLinear") +class FalconLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="KVCache") +class KVCache(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.allocate = patched_mod.org_allocate + self.get_shape = patched_mod.get_shape + self.forward = patched_mod.forward + self.update = patched_mod.update + +@helper_mod_register(name="Conv2d") +class Conv2d(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleLinear") +class LoRACompatibleLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleConv") +class LoRACompatibleConv(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Softmax") +class Softmax(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearLayer") +class LinearLayer(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearAllreduce") +class LinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ScopedLinearAllReduce") +class ScopedLinearAllReduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LmHeadLinearAllreduce") +class LmHeadLinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ModuleFusedSDPA") +class ModuleFusedSDPA(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py b/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py deleted file mode 100644 index c80cc443531..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import copy -import os - -import habana_frameworks.torch.core as htcore -import torch -from deepspeed.module_inject import LinearAllreduce, LinearLayer -from deepspeed.module_inject.layers import LmHeadLinearAllreduce -from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - -from neural_compressor.torch.utils import fetch_module, logger, set_module - -from .modules import ( # fp32; dynamic modules; static modules; dtype amax - Autocast, - BatchMatmul, - FP8BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8LinearAllreduce, - FP8LinearLayer, - FP8LmHeadLinearAllreduce, - FP8Matmul, - Matmul, -) -from .observer import observer_mapping - -quantization_mapping = { - LinearAllreduce: FP8LinearAllreduce, - LinearLayer: FP8LinearLayer, - LmHeadLinearAllreduce: FP8LmHeadLinearAllreduce, - torch.nn.Linear: FP8Linear, - BatchMatmul: FP8BatchMatmul, - Matmul: FP8Matmul, - Autocast: FP8Cast, - # torch.matmul: fp8_matmul -} -white_list = tuple(quantization_mapping.keys()) - - -FP8_DTYPE = [torch.float8_e5m2, torch.float8_e4m3fn, "fp8_e5m2", "fp8_e4m3"] -dtype_mapping = {"fp8_e5m2": torch.float8_e5m2, "fp8_e4m3": torch.float8_e4m3fn} -# enable inference optimizations -htcore.hpu_initialize() - - -def _replace_module(module, qconfig): - assert qconfig.w_dtype == qconfig.act_dtype, "weight and activation should be the same dtype." 
- dtype = dtype_mapping[qconfig.w_dtype] - # only modules that have weight should use this observer - if hasattr(module, "weight"): - observer_cls = observer_mapping[qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - if qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - elif qconfig.approach == "dynamic": - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - return qmodule - - -def quantize_dynamic(model, dtype=torch.float8_e4m3fn, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - if isinstance(dtype, str): - dtype = dtype_mapping[dtype] - for n, m in q_model.named_modules(): - if isinstance(m, torch.nn.Linear): - observer_cls = observer_mapping["minmax_per_channel"] - observer_obj = observer_cls(dtype=dtype) - new_m = FP8DynamicLinear(m, dtype) # need m for init - new_m.from_float(m, observer_obj) - set_module(q_model, n, new_m) - elif isinstance(m, Matmul): - new_m = FP8DynamicMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, BatchMatmul): - new_m = FP8DynamicBatchMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, Autocast): - new_m = FP8Cast(dtype=dtype) - set_module(q_model, n, new_m) - htcore.mark_step() - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model - - -def _add_observer(module, qconfig): - act_observer = qconfig.act_observer - - def input_observer_forward_pre_hook(self, input): - try: - if isinstance(input[0], torch.Tensor): - self.input_activation_post_process(input[0]) - if hasattr(self, "input_activation_post_process1") and isinstance(input[1], torch.Tensor): - self.input_activation_post_process1(input[1]) - return input - except Exception as e: - # The KL act_observer may encounter a overflow error on EltwiseAdd. 
- pass - - ### Insert input observer into model, only for fp8_e4m3 static quantization ### - observer_cls = observer_mapping[act_observer] - - if isinstance(module, white_list): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process", observer_obj) - if isinstance(module, (BatchMatmul, Matmul)): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process1", observer_obj) - module.register_forward_pre_hook(input_observer_forward_pre_hook) - - -def _remove_observer(module): - import deepspeed.comm as dist - from torch.distributed import ReduceOp - - if hasattr(module, "input_activation_post_process"): - scale = module.input_activation_post_process.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - if hasattr(module, "input_activation_post_process1"): - module.register_parameter("scale1", torch.nn.Parameter(scale)) - else: - module.register_parameter("scale", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process") - if hasattr(module, "input_activation_post_process1"): - scale = module.input_activation_post_process1.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - module.register_parameter("scale2", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process1") - - # remove observer hooks - hook_map = module._forward_pre_hooks - handle_ids_to_remove = set() - for handle_id, hook_fn in hook_map.items(): - if hasattr(hook_fn, "__name__") and hook_fn.__name__ == "input_observer_forward_pre_hook": - handle_ids_to_remove.add(handle_id) - for handle_id in handle_ids_to_remove: - hook_map.pop(handle_id) - - -def prepare(model, qconfig_mapping): - model.qconfig = qconfig_mapping - for (op_name, op_type), qconfig in qconfig_mapping.items(): - if qconfig.approach == "dynamic": - continue - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - _add_observer(module, qconfig) - set_module(model, op_name, module) - return model - - -def convert(model): - for (op_name, op_type), qconfig in model.qconfig.items(): - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - if qconfig.approach != "dynamic": - _remove_observer(module) - module = _replace_module(module, qconfig) - set_module(model, op_name, module) - htcore.mark_step() - return model - - -def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - q_model = prepare(q_model, qconfig_mapping) - if run_fn is not None: - if run_args is not None: - run_fn(q_model, *run_args) - else: - run_fn(q_model) - q_model = convert(q_model) - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model diff --git a/neural_compressor/torch/algorithms/habana_fp8/modules.py b/neural_compressor/torch/algorithms/habana_fp8/modules.py deleted file mode 100644 index 99b9faf1f72..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/modules.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this 
file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -import torch.nn as nn -from torch.nn import functional as F - -from neural_compressor.common import logger - -from .observer import calculate_qparams - - -##################### FP32 modules ####################### -class Matmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class BatchMatmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.bmm(x, y) - - -class Autocast(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x - - -##################### FP8 modules ####################### -class FP8DynamicLinear(torch.nn.Module): - def __init__(self, org_module, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - # attributes - self.use_amax = True - self.dtype = dtype - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - # register weight, bias - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - if self.use_amax: - input_scale = calculate_qparams(inp.min(), inp.max(), self.dtype) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - inp = torch.ops.hpu.cast_to_fp8_v2(inp, input_scale_inv, False, False, self.dtype)[0] - else: - input_scale, input_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - input_scale, # inv is used for recover scale - self.weight_scale, - self.bias, - 
False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.dtype, - ) - - -class FP8DynamicMatmul(torch.nn.Module): - def __init__(self, dtype) -> None: - super().__init__() - self.dtype = dtype - self.use_amax = True - self.out_dtype = torch.float32 - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - if self.use_amax: - input1_scale = calculate_qparams(input1.min(), input1.max(), self.dtype) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - if self.use_amax: - input2_scale = calculate_qparams(input2.min(), input2.max(), self.dtype) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - input1_scale, # inv is used for recover scale - input2_scale, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "format={}".format(self.dtype) - - -class FP8DynamicBatchMatmul(FP8DynamicMatmul): - pass - - -class FP8Linear(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - # attributes - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.dtype = dtype - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - # register input scale - input_scale = org_module.scale if hasattr(org_module, "scale") else torch.tensor([1.0]) - self.register_buffer( - "input_scale", - 
torch.tensor( - input_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "input_scale_inv", - torch.tensor( - torch.reciprocal(input_scale), - device="hpu", - dtype=torch.float32, - ), - ) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, # inv is used for recover scale - self.weight_scale, - self.bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, scale={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.input_scale.tolist() if hasattr(self, "input_scale") else None, - self.dtype, - ) - - -class FP8Matmul(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - org_module.to("hpu") - self.dtype = dtype - self.out_dtype = torch.float32 - scale1 = org_module.scale1 if hasattr(org_module, "scale1") else 1.0 - scale2 = org_module.scale2 if hasattr(org_module, "scale2") else 1.0 - self.register_buffer( - "scale1", - torch.tensor( - scale1, - device="hpu", - dtype=self.out_dtype, - ), - ) - self.register_buffer( - "scale2", - torch.tensor( - scale2, - device="hpu", - dtype=self.out_dtype, - ), - ) - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - self.scale1_inv = torch.reciprocal(self.scale1) - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, self.scale1_inv, False, False, self.dtype)[0] - else: - self.scale1_inv = None - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - self.scale2_inv = torch.reciprocal(self.scale2) - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, self.scale2_inv, False, False, self.dtype)[0] - else: - self.scale2_inv = None - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - self.scale1, # inv is used for recover scale - self.scale2, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - (self.scale1.tolist(), self.scale2.tolist()), - self.dtype, - ) - - -class FP8BatchMatmul(FP8Matmul): - pass - - -class FP8Cast(torch.nn.Module): - def __init__(self, org_module=None, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - self.dtype = dtype - if org_module is not None: - org_module.to("hpu") - scale = org_module.scale if hasattr(org_module, "scale") else 1.0 - self.register_buffer( - "scale", - torch.tensor( - scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.scale, self.scale_inv = None, None # due to next matmul doesn't know this scale - else: - self.scale, self.scale_inv = None, None - - def forward(self, input): - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out = torch.ops.hpu.cast_to_fp8_v2(input, self.scale_inv, False, False, self.dtype)[0] - else: - out = input - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - self.scale, - self.dtype, - ) - - -FP8LinearLayer = FP8Linear - - -class FP8LinearAllreduce(FP8Linear): - 
def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - inputmat = inp.view(-1, self.in_features) - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) - - -class FP8LmHeadLinearAllreduce(FP8Linear): - def forward(self, inp): - # from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list - # input_shard_size = get_shard_size(inp.shape[-1], self.world_size) - # input_shard_offset = sum(get_shard_size_list(inp.shape[-1], self.world_size)[0:self.rank]) - - # inputmat = inp[:, :, input_shard_offset:input_shard_offset + input_shard_size] - assert ( - inp.shape[-1] % self.world_size == 0 - ), "Please ensure that self.world_size is divisible by input.shape[-1]" - input_shard = inp.shape[-1] // self.world_size - inp_part = inp[:, :, self.rank * input_shard : (self.rank + 1) * input_shard] - inputmat = inp_part.view(-1, input_shard) # dim=2 will help kernel speed - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) diff --git a/neural_compressor/torch/algorithms/habana_fp8/observer.py b/neural_compressor/torch/algorithms/habana_fp8/observer.py deleted file mode 100644 index fd29892ddb7..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/observer.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# pylint:disable=import-error - -import os -from typing import Tuple - -import habana_frameworks.torch.core as htcore -import torch -from torch.ao.quantization.observer import * - -E4M3_AMAX = torch.tensor(240, dtype=torch.float).to("cpu") -E5M2_AMAX = torch.tensor(57344, dtype=torch.float).to("cpu") -USE_HW_SCALE = bool(os.getenv("USE_HW_SCALE", False)) -USE_POW2_SCALE = bool(os.getenv("USE_POW2_SCALE", False)) -observer_mapping = {} - - -def observer_registry(name): - def new_observer(observer_cls): - global observer_mapping - observer_mapping[name] = observer_cls - return observer_cls - - return new_observer - - -def _map_gaudi_scale(scale): - if USE_HW_SCALE: - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - elif USE_POW2_SCALE: - return 2 ** torch.ceil(torch.log2(scale)) - else: - return scale - - -def calculate_qparams(min_val, max_val, dtype): - amax = torch.max(torch.abs(min_val), torch.abs(max_val)) - dtype_amax = E4M3_AMAX if dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = amax / dtype_amax - scale = scale.reshape(-1) - return _map_gaudi_scale(scale) - - -@observer_registry(name="minmax") -class FP8MinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - - def forward(self, x_orig): - r"""Records the running minimum and maximum of ``x``.""" - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - x = x.to(self.min_val.dtype) - min_val_cur, max_val_cur = torch.aminmax(x) - min_val = torch.min(min_val_cur, self.min_val) - max_val = torch.max(max_val_cur, self.max_val) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="minmax_per_channel") -class FP8PerChannelMinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ch_axis=0, # weight_shape = (out_features, in_features) - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." 
- self.ch_axis = ch_axis - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) - self.register_buffer("max_val", torch.tensor([], **factory_kwargs)) - - def forward(self, x_orig): - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - min_val = self.min_val - max_val = self.max_val - x_dim = x.size() - - new_axis_list = [i for i in range(len(x_dim))] - new_axis_list[self.ch_axis] = 0 - new_axis_list[0] = self.ch_axis - y = x.permute(new_axis_list) - # Need to match dtype of min/max because the updates to buffers - # are done in place and types need to match for comparisons - y = y.to(self.min_val.dtype) - y = torch.flatten(y, start_dim=1) - if min_val.numel() == 0 or max_val.numel() == 0: - min_val, max_val = torch.aminmax(y, dim=1) - else: - min_val_cur, max_val_cur = torch.aminmax(y, dim=1) - min_val = torch.min(min_val_cur, min_val) - max_val = torch.max(max_val_cur, max_val) - self.min_val.resize_(min_val.shape) - self.max_val.resize_(max_val.shape) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="kl") -class FP8HistogramObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - bins: int = 2048, - upsample_rate: int = 128, - qscheme=torch.per_tensor_affine, - eps=torch.finfo(torch.float32).eps, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - self.bins = bins - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs)) - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - self.dst_nbins = 2 ** torch.finfo(self.dtype).bits - self.upsample_rate = upsample_rate - - def calculate_qparams(self, **kwargs): - new_min, new_max = self._non_linear_param_search() - scale = calculate_qparams(new_min, new_max, self.dtype) - return scale - - def _get_norm(self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor) -> torch.Tensor: - r"""Compute the norm of the values uniformaly distributed between - delta_begin and delta_end. - Currently only L2 norm is supported. 
- - norm = density * (integral_{begin, end} x^2) - = density * (end^3 - begin^3) / 3 - """ - norm = (delta_end * delta_end * delta_end - delta_begin * delta_begin * delta_begin) / 3 - return density * norm - - def _get_dst_bin(self, src_bin_begin, src_bin_end, dst_bin_max): - # get dst bin value - FP8_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = FP8_amax / dst_bin_max - if torch.isinf(torch.tensor(scale)): - scale = torch.tensor(3.4e38) - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_begin.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_begin = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_end.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_end = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - # get bin width of dst bin value, dst_bin_begin must contain 0 and the max qvalue. - dst_bin = list(set(dst_bin_begin.detach().cpu().numpy())) - dst_bin.sort() - width_dict = {} - bin_of_dst_dict = {} - for i, bin in enumerate(dst_bin): - bin_of_dst_dict[bin] = i - if bin == 0: - width_dict[bin] = {"left": 0, "right": dst_bin[i + 1]} - elif i == len(dst_bin) - 1: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i] - dst_bin[i - 1]} - else: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i + 1] - dst_bin[i]} - dst_bin_of_begin = [bin_of_dst_dict[float(i)] for i in dst_bin_begin] - dst_bin_of_end = [bin_of_dst_dict[float(i)] for i in dst_bin_end] - left_dst_bin_end_width = [width_dict[float(i)]["left"] for i in dst_bin_end] - right_dst_bin_begin_width = [width_dict[float(i)]["right"] for i in dst_bin_begin] - return ( - dst_bin_begin, - dst_bin_end, - torch.tensor(dst_bin_of_begin), - torch.tensor(dst_bin_of_end), - torch.tensor(left_dst_bin_end_width), - torch.tensor(right_dst_bin_begin_width), - ) - - def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int): - r"""Compute the quantization error if we use start_bin to end_bin as the - min and max to do the quantization.""" - bin_width = (self.max_val.item() - self.min_val.item()) / self.bins - dst_bin_max = bin_width * (next_end_bin - next_start_bin + 1) - - src_bin = torch.arange(self.bins, device=self.histogram.device) - src_bin_begin = src_bin * bin_width - src_bin_end = src_bin_begin + bin_width - ( - dst_bin_begin, - dst_bin_end, - dst_bin_of_begin, - dst_bin_of_end, - left_dst_bin_end_width, - right_dst_bin_begin_width, - ) = self._get_dst_bin(src_bin_begin, src_bin_end, dst_bin_max) - - dst_bin_of_begin_center = dst_bin_begin + right_dst_bin_begin_width - dst_bin_of_end_center = dst_bin_end + left_dst_bin_end_width - - density = self.histogram / bin_width - - norm = torch.zeros(self.bins, device=self.histogram.device) - - delta_begin = src_bin_begin - dst_bin_of_begin_center - delta_end = right_dst_bin_begin_width - - norm += self._get_norm(delta_begin, delta_end, density) - - norm += (dst_bin_of_end - dst_bin_of_begin - 1) * self._get_norm( - torch.tensor(-left_dst_bin_end_width), torch.tensor(right_dst_bin_begin_width), density - ) - - delta_begin = -left_dst_bin_end_width - delta_end = src_bin_end - dst_bin_of_end_center - norm += self._get_norm(delta_begin, delta_end, density) - - return norm.sum().item() - - def _non_linear_param_search(self) -> Tuple[torch.Tensor, torch.Tensor]: - r"""Non-linear parameter search. - - An approximation for L2 error minimization for selecting min/max. 
- By selecting new min/max, we filter out outliers in input distribution. - This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in - caffe2/quantization/server/norm_minimization.cc - """ - assert self.histogram.size()[0] == self.bins, "bins mismatch" - bin_width = (self.max_val - self.min_val) / self.bins - - # cumulative sum - total = torch.sum(self.histogram).item() - cSum = torch.cumsum(self.histogram, dim=0) - - stepsize = 1e-5 # granularity - alpha = 0.0 # lower bound - beta = 1.0 # upper bound - start_bin = 0 - end_bin = self.bins - 1 - norm_min = float("inf") - - while alpha < beta: - # Find the next step - next_alpha = alpha - next_beta = beta - stepsize - - # find the right bins between the quantile bounds - # keep the left bins at zero due to fp8 symmetry - l = 0 - r = end_bin - while r > start_bin and cSum[r] > next_beta * total: - r = r - 1 - - # decide the next move - next_start_bin = start_bin - next_end_bin = end_bin - if (l - start_bin) <= (end_bin - r): - # move the end bin - next_end_bin = r - beta = next_beta - - if next_start_bin == start_bin and next_end_bin == end_bin: - continue - - # calculate the quantization error using next_start_bin and next_end_bin - norm = self._compute_quantization_error(next_start_bin, next_end_bin) - - if norm > norm_min: - break - norm_min = norm - start_bin = next_start_bin - end_bin = next_end_bin - - new_min = self.min_val + bin_width * start_bin - new_max = self.min_val + bin_width * (end_bin + 1) - return new_min, new_max - - def _adjust_min_max( - self, combined_min: torch.Tensor, combined_max: torch.Tensor, upsample_rate: int - ) -> Tuple[torch.Tensor, torch.Tensor, int, int]: - # We ensure that: - # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins) - # This allows us to have a common grid of resolution s, where we can align - # the input histogram - # start_idx maps min_val to the histogram bin index. - - # Compute the width of histogram bins is a straightforward solution, where - # hist_bin_width = (self.max_val - self.min_val) / (self.bins * upsample_rate) - # Underflow happens if the numerator is close to the smallest positive subnormal number of FP32 - # Therefore, we avoid such division operation. - downsample_rate = int( - torch.ceil((combined_max - combined_min) * upsample_rate / (self.max_val - self.min_val)).item() - ) - e = downsample_rate * (self.max_val - self.min_val) / upsample_rate - (combined_max - combined_min) - start_idx = int( - torch.round( - (self.min_val - combined_min) * self.bins * upsample_rate / (self.max_val - self.min_val) - ).item() - ) - combined_max = combined_max + e - combined_min = combined_min - return combined_min, combined_max, downsample_rate, start_idx - - def _combine_histograms( - self, - orig_hist: torch.Tensor, - new_hist: torch.Tensor, - upsample_rate: int, - downsample_rate: int, - start_idx: int, - Nbins: int, - ) -> torch.Tensor: - # First up-sample the histogram with new data by a factor of L - # This creates an approximate probability density that's piecewise constant - upsampled_histogram = new_hist.repeat_interleave(upsample_rate) - # Now insert the upsampled histogram into the output - # histogram, which is initialized with zeros. 
- # The offset at which the histogram is introduced is determined - # by the start index as the output histogram can cover a wider range - histogram_with_output_range = torch.zeros((Nbins * downsample_rate), device=orig_hist.device) - histogram_with_output_range[start_idx : Nbins * upsample_rate + start_idx] = upsampled_histogram - # Compute integral histogram, double precision is needed to ensure - # that there are no overflows - integral_histogram = torch.cumsum(histogram_with_output_range, 0, dtype=torch.double)[ - downsample_rate - 1 :: downsample_rate - ] - # Finally perform interpolation - shifted_integral_histogram = torch.zeros((Nbins), device=orig_hist.device) - shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1] - interpolated_histogram = (integral_histogram - shifted_integral_histogram) / upsample_rate - orig_hist = orig_hist + interpolated_histogram.to(torch.float) - return orig_hist - - def forward(self, x_orig: torch.Tensor) -> torch.Tensor: - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - # use abs due to fp8 symmetry - x = torch.abs(x) - min_val = self.min_val - max_val = self.max_val - same_values = min_val.item() == max_val.item() - is_uninitialized = min_val == float("inf") and max_val == float("-inf") - if is_uninitialized or same_values: - min_val, max_val = torch.aminmax(x) - self.min_val.resize_(min_val.shape) - self.min_val.copy_(min_val) - self.max_val.resize_(max_val.shape) - self.max_val.copy_(max_val) - assert min_val.numel() == 1 and max_val.numel() == 1, "histogram min/max values must be scalar." - torch.histc(x, self.bins, min=int(min_val), max=int(max_val), out=self.histogram) - else: - new_min, new_max = torch.aminmax(x) - combined_min = torch.min(new_min, min_val) - combined_max = torch.max(new_max, max_val) - # combine the existing histogram and new histogram into 1 histogram - # We do this by first upsampling the histogram to a dense grid - # and then downsampling the histogram efficiently - ( - combined_min, - combined_max, - downsample_rate, - start_idx, - ) = self._adjust_min_max(combined_min, combined_max, self.upsample_rate) - assert combined_min.numel() == 1 and combined_max.numel() == 1, "histogram min/max values must be scalar." - combined_histogram = torch.histc(x, self.bins, min=int(combined_min), max=int(combined_max)) - if combined_min == min_val and combined_max == max_val: - combined_histogram += self.histogram - else: - combined_histogram = self._combine_histograms( - combined_histogram, - self.histogram, - self.upsample_rate, - downsample_rate, - start_idx, - self.bins, - ) - - self.histogram.detach_().resize_(combined_histogram.shape) - self.histogram.copy_(combined_histogram) - self.min_val.detach_().resize_(combined_min.shape) - self.min_val.copy_(combined_min) - self.max_val.detach_().resize_(combined_max.shape) - self.max_val.copy_(combined_max) - return x_orig - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" diff --git a/neural_compressor/torch/algorithms/habana_fp8/save_load.py b/neural_compressor/torch/algorithms/habana_fp8/save_load.py deleted file mode 100644 index 8079a130625..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/save_load.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import json -import os - -import habana_frameworks.torch.core as htcore -import torch - -from neural_compressor.common.utils import load_config_mapping, save_config_mapping -from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger - -from .fp8_quant import FP8_DTYPE, dtype_mapping -from .modules import ( # fp32; dynamic modules - Autocast, - BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - Matmul, -) -from .observer import observer_mapping - - -def save(model, output_dir="./saved_results"): - if not os.path.exists(output_dir): - os.mkdir(output_dir) - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME) - # saving process - save_config_mapping(model.qconfig, qconfig_file_path) - - import fp8_convert - - stat_dict = {} - for k, v in model.state_dict().items(): - if v.dtype in FP8_DTYPE: - v = fp8_convert.to_u8(v.to("cpu")) - stat_dict[k] = v.to("cpu") - torch.save(stat_dict, qmodel_file_path) - - logger.info("Save state_dict of quantized model to {}.".format(qmodel_file_path)) - logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path)) - - -def load(model, output_dir="./saved_results"): - from neural_compressor.torch.utils import fetch_module, set_module - - from .fp8_quant import quantization_mapping, white_list - - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - stat_dict = torch.load(qmodel_file_path) - import fp8_convert - - for (op_name, op_type), op_qconfig in model.qconfig.items(): - dtype = dtype_mapping[op_qconfig.w_dtype] - # only modules that have weight should use this observer - observer_cls = observer_mapping[op_qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - choice = 1 if dtype == torch.float8_e4m3fn else 0 - if op_name + ".weight" in stat_dict: - stat_dict[op_name + ".weight"] = fp8_convert.from_u8(stat_dict[op_name + ".weight"], choice) - if dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - # replace module - if op_qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - else: - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - # replace module with qmodule - set_module(model, op_name, qmodule) - htcore.mark_step() - model.load_state_dict(stat_dict, assign=True) - model.to("hpu") - htcore.mark_step() - logger.info("Quantized model loading successful.") - return model diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/scale.py b/neural_compressor/torch/algorithms/habana_fp8/scale.py deleted file mode 100644 index 1dfaee24502..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/scale.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import habana_frameworks.torch.core as htcore -import torch - -scale_method_mapping = {} - - -def scale_method_registry(name): - def new_scale_method(scale_method_cls): - global scale_method_mapping - scale_method_mapping[name] = scale_method_cls - return scale_method_cls - - return new_scale_method - - -@scale_method_registry("hw") -def hardware_scale_method(scale): - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - - -@scale_method_registry("pow2") -def pow2_scale_method(scale): - return 2 ** torch.ceil(torch.log2(scale)) - - -@scale_method_registry("unit") -def unit_scale_method(scale): - return torch.tensor(1.0) - - -@scale_method_registry("self") -def self_scale_method(scale): - return scale - - -def map_gaudi_scale(scale, method): - scale_method = scale_method_mapping[method] - return scale_method(scale) diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py b/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp b/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp deleted file mode 100644 index f22c5c82c89..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Temporary implementation of fp8 tensor saving and loading -// Will remove after Habana torch applies below patch: -// https://github.com/pytorch/pytorch/pull/114662 - - -#include - - -// function prototype declaration -torch::Tensor to_u8(torch::Tensor tensor); -torch::Tensor from_u8(torch::Tensor tensor, int choice=1); - - -torch::Tensor to_u8(torch::Tensor tensor) { - auto p = tensor.data_ptr(); - // RuntimeError: HPU device type not enabled. - auto options = torch::TensorOptions().device(torch::kCPU).dtype(torch::kUInt8); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_uint8 = torch::empty_like(tensor, torch::kUInt8).copy_(tmp); - return tensor_uint8; -}; - - -/* -choice=1 means torch.float8_e4m3fn; -others means torch.float8_e5m2; -*/ -torch::Tensor from_u8(torch::Tensor tensor, int choice) { - auto p = tensor.data_ptr(); - torch::ScalarType dtype; - if (choice == 1) { - dtype = torch::kFloat8_e4m3fn; - } - else { - dtype = torch::kFloat8_e5m2; - } - auto options = torch::TensorOptions().device(torch::kCPU).dtype(dtype); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_fp8 = torch::empty_like(tensor, dtype).copy_(tmp); - return tensor_fp8; -}; - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("to_u8", &to_u8, "Convert tensor to u8 for saving."); - m.def("from_u8", &from_u8, "Recover tensor from u8 for loading."); -}; diff --git a/neural_compressor/torch/amp/__init__.py b/neural_compressor/torch/amp/__init__.py deleted file mode 100644 index 87a0c8287d0..00000000000 --- a/neural_compressor/torch/amp/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .autocast import autocast diff --git a/neural_compressor/torch/amp/autocast.py b/neural_compressor/torch/amp/autocast.py deleted file mode 100644 index 7375b80c0f5..00000000000 --- a/neural_compressor/torch/amp/autocast.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Any, Optional - -import torch -from torch.types import _dtype - - -class autocast: - r"""Instances of :class:`autocast` serve as context managers or decorators that - allow regions of your script to run in mixed precision. - - In these regions, ops run in an op-specific dtype chosen by autocast - to improve performance while maintaining accuracy. - - When entering an autocast-enabled region, Tensors may be any type. - You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. - - :class:`autocast` should wrap only the forward pass(es) of your network, including the loss - computation(s). Backward passes under autocast are not recommended. - Backward ops run in the same type that autocast used for corresponding forward ops. - - # Enables autocasting for the inference pass - with torch.autocast(device_type="hpu", dtype=torch.float8_e4m3fn): - output = model(input) - - :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: - - class AutocastModel(nn.Module): - ... - @torch.autocast(device_type="cuda") - def forward(self, input): - ... - - The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator - must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and - :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process - (see :ref:`Working with Multiple GPUs`). - - Args: - device_type(str, required): Device type to use. Possible values are: 'cuda', 'cpu', 'xpu' and 'hpu'. - The type is the same as the `type` attribute of a :class:`torch.device`. - Thus, you may obtain the device type of a tensor using `Tensor.device.type`. - enabled(bool, optional): Whether autocasting should be enabled in the region. - Default: ``True`` - dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. - cache_enabled(bool, optional): Whether the weight cache inside autocast should be enabled. 
- Default: ``True`` - """ - - def __init__( - self, - device_type: str, - dtype: Optional[_dtype] = None, - enabled: bool = True, - cache_enabled: Optional[bool] = None, - ): - self.device = device_type - if dtype is not None: - self.fast_dtype = dtype - if cache_enabled is not None: - self._cache_enabled = cache_enabled - if not (device_type == "hpu" and dtype in [torch.float8_e4m3fn, torch.float8_e5m2]): - self._autocast = torch.autocast(device_type, dtype, enabled, cache_enabled) - - def __enter__(self) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import replace_func - - # This function will replace F.linear and torch.matmul with the fp8 one - replace_func(self.fast_dtype) - else: - self._autocast.__enter__() - - def __exit__(self, exc_type, exc_value, traceback) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import recover_func - - # This function will recover F.linear and torch.matmul with the original one - recover_func() - else: - self._autocast.__exit__(exc_type, exc_value, traceback) diff --git a/neural_compressor/torch/amp/fp8/__init__.py b/neural_compressor/torch/amp/fp8/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/amp/fp8/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/amp/fp8/functions.py b/neural_compressor/torch/amp/fp8/functions.py deleted file mode 100644 index f8f19a64b17..00000000000 --- a/neural_compressor/torch/amp/fp8/functions.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -from torch.nn import functional as F - -from neural_compressor.torch.algorithms.habana_fp8.observer import calculate_qparams -from neural_compressor.torch.utils import logger - -_F_linear = F.linear -_torch_matmul = torch.matmul -_torch_bmm = torch.bmm - - -DATA_TYPE = torch.float8_e4m3fn -USE_AMAX = bool(os.getenv("PT_USE_FP8_AMAX", False)) - - -def fp8_linear_forward(input, weight, bias=None): - out_dtype = torch.float32 - org_middle_shape = input.shape[1:-1] - input = input.view((-1, weight.shape[-1])) - # process input - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input.dtype - if USE_AMAX: - input_scale = calculate_qparams(input.min(), input.max(), DATA_TYPE) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - input = torch.ops.hpu.cast_to_fp8_v2(input, input_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input - input_scale, input_scale_inv = None, None - # process weight - if weight.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = weight.dtype - if USE_AMAX: - weight_scale = calculate_qparams(weight.min(), weight.max(), DATA_TYPE) - weight_scale_inv = torch.reciprocal(weight_scale) - else: - weight_scale, weight_scale_inv = None, None - weight = torch.ops.hpu.cast_to_fp8_v2(weight, weight_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for weight - weight_scale, weight_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - input, - False, - weight, - True, - None, - out_dtype, - input_scale, - weight_scale, - bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - -def fp8_matmul(input1, input2): - out_dtype = torch.float32 - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input1.dtype - if USE_AMAX: - input1_scale = calculate_qparams(input1.min(), input1.max(), DATA_TYPE) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input2.dtype - if USE_AMAX: - input2_scale = calculate_qparams(input2.min(), input2.max(), DATA_TYPE) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - out_dtype, - input1_scale, - input2_scale, - None, - False, - ) - return out - - -def replace_func(dtype): - global DATA_TYPE - DATA_TYPE = dtype - F.linear = fp8_linear_forward - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - logger.debug("F.linear and torch.matmul are replaced with the fp8 one") - - -def recover_func(): - F.linear = _F_linear - torch.matmul = _torch_matmul - torch.bmm = _torch_bmm - logger.debug("F.linear and torch.matmul are recovered") diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index f6a015eb89f..64d7816ad81 
100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. """Intel Neural Compressor Pytorch quantization API.""" -from neural_compressor.torch.quantization.quantize import quantize, prepare, convert +from neural_compressor.torch.quantization.quantize import quantize, prepare, convert, finalize_calibration from neural_compressor.torch.quantization.config import ( RTNConfig, get_default_rtn_config, diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index c86c604152c..82edd0c610d 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -678,20 +678,22 @@ def hqq_entry( ###################### Habana FP8 Algo Entry ################################## -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize, save - - @register_algo(FP8_QUANT) - def fp8_quant_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str], FP8Config], *args, **kwargs - ) -> torch.nn.Module: - kwargs.pop("example_inputs") - model = quantize(model, configs_mapping, *args, **kwargs) - model.qconfig = configs_mapping - model.save = MethodType(save, model) - return model +@register_algo(FP8_QUANT) +@torch.no_grad() +def fp8_entry( + model: torch.nn.Module, + configs_mapping: Dict[Tuple[str], FP8Config], + mode: Mode = Mode.QUANTIZE, + *args, + **kwargs, +) -> torch.nn.Module: + """The main entry to apply fp8 quantization.""" + from neural_compressor.torch.algorithms.fp8_quant import FP8Quantizer + + quantizer = get_quantizer(model, quantizer_cls=FP8Quantizer, quant_config=configs_mapping) + model = quantizer.execute(model, mode=mode) + postprocess_model(model, mode, quantizer) + return model ###################### MX Quant Algo Entry ################################## diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 29f944b93e3..ecc18848e52 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -18,6 +18,8 @@ """Intel Neural Compressor Pytorch quantization config API.""" +import json +import importlib from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType @@ -1606,81 +1608,142 @@ def get_default_hqq_config() -> HQQConfig: return HQQConfig() -######################## FP8 Config ############################### +######################## FP8 Quant Config ############################### +# refer to habana_quantization_toolkit/_core/common.py +FP8_WHITE_LIST = ( + "Matmul", "Linear", "FalconLinear", "KVCache", "Conv2d", + "LoRACompatibleLinear", "LoRACompatibleConv", "Softmax", "ModuleFusedSDPA") +if importlib.util.find_spec("deepspeed"): + FP8_WHITE_LIST.append( + "LinearLayer", "LinearAllreduce","ScopedLinearAllReduce", "LmHeadLinearAllreduce") + @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT) class FP8Config(TorchBaseConfig): """Config class for FP8 quantization.""" name = FP8_QUANT - supported_configs: List[OperatorConfig] = [] + + # tunable params params_list = [ - "w_dtype", - "w_observer", - "act_dtype", - "act_observer", - "approach", - "device", + "fp8_config", + "scale_method", + "observer", + "measure_exclude", ] def 
__init__( self, - w_dtype: str = "fp8_e4m3", - w_observer: Union[str, List[str]] = "minmax_per_channel", - act_dtype: str = "fp8_e4m3", - act_observer: Union[str, List[str]] = "minmax", - approach: Union[str, List[str]] = "static", - device: Union[str, List[str]] = "hpu", - white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, + dump_stats_path: str = "./hqt_output/measure", + fp8_config: str = "E4M3", + hp_dtype: torch.dtype = torch.bfloat16, + blocklist: dict = {'names': [], 'types': ()}, + allowlist: dict = {'names': [], 'types': FP8_WHITE_LIST}, + mode: str = "AUTO", + scale_method: str = "maxabs_hw", + scale_params: dict = {}, + observer: str = "maxabs", + mod_dict: dict = {}, + measure_exclude: str = "OUTPUT", + **kwargs, ): - """Init FP8 config. + """Init FP8 config.""" + super().__init__() + self.dump_stats_path =dump_stats_path + self.fp8_config = fp8_config + self.hp_dtype = hp_dtype + self.blocklist = blocklist + self.allowlist = allowlist + self.mode = mode + self.scale_method = scale_method + self.scale_params = scale_params + self.observer = observer + self.mod_dict = mod_dict + self._json_file = None + + @property + def measure(self): + return self.mode == "MEASURE" + + @property + def quantize(self): + return self.mode == "QUANTIZE" + + @property + def json_file(self): + if self._json_file is None: + import tempfile + from pathlib import Path + + json_file_tmp = tempfile.NamedTemporaryFile(suffix=".json") + self.to_json_file(json_file_tmp.name) + self.json_file(json_file_tmp.name) + return self._json_file + + @json_file.setter + def json_file(self, json_file): + self._json_file = json_file - Args: - """ - super().__init__(white_list=white_list) - self.w_dtype = w_dtype - self.w_observer = w_observer - self.act_dtype = act_dtype - self.act_observer = act_observer - self.approach = approach - self.device = device - self._post_init() + @classmethod + def from_json_file(cls, filename): + with open(filename, "r", encoding="utf-8") as file: + config_dict = json.load(file) + config = cls.from_dict(config_dict) + config.json_file = filename + return config @classmethod - def register_supported_configs(cls) -> List[OperatorConfig]: + def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: + # just a simple example here + # usually write parameter combinations that are more suitable to tune based on experience. 
+ return FP8Config( + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "maxabs_hw"], + measure_exclude=["NONE", "OUTPUT"]) + + @classmethod + def register_supported_configs(cls): + """Add all supported configs.""" supported_configs = [] - fp8_config = FP8Config( - w_dtype=["fp8_e5m2", "fp8_e4m3"], - w_observer=["minmax", "minmax_per_channel"], - act_dtype=["fp8_e5m2", "fp8_e4m3"], - act_observer=["minmax", "kl"], - approach=["static", "dynamic"], - device=["hpu"], + linear_rtn_config = FP8Config( + mode=["AUTO", "MEASURE", "QUANTIZE"], + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "unit_scale", "max", "maxabs_hw", + "maxabs_pow2", "maxabs_hw_opt_weight", "maxabs_pow2_opt_weight", + "smoothquant_weights_output_channel_maxabs_pow2", + "weaksmoothquant_weights_output_channel_maxabs_pow2", + "act_maxabs_hw_weights_pcs_maxabs_pow2", + "act_maxabs_hw_weights_pcs_opt_pow2", + "act_maxabs_pow2_weights_pcs_maxabs_pow2", + "act_maxabs_pow2_weights_pcs_opt_pow2", + "smoothquant_opt"], + observer=["shape", "maxabs", "maxabs_per_channel", "save"], + measure_exclude=["NONE", "OUTPUT", "INPUT", "ALL"], ) - if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import white_list - - operators = white_list - else: - operators = () - supported_configs.append(OperatorConfig(config=fp8_config, operators=operators)) + operators = list(FP8_WHITE_LIST) + supported_configs.append(OperatorConfig(config=linear_rtn_config, operators=operators)) cls.supported_configs = supported_configs @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: - from neural_compressor.torch.algorithms.habana_fp8 import white_list - filter_result = [] for op_name, module in model.named_modules(): - if isinstance(module, white_list): - pair = (op_name, type(module).__name__) + if module.__class__.__name__ in FP8_WHITE_LIST or \ + module.__class__.__name__.split("Patched")[-1] in FP8_WHITE_LIST: + pair = (op_name, module.__class__.__name__) filter_result.append(pair) logger.debug(f"Get model info: {filter_result}") return filter_result - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: - # TODO fwk owner needs to update it. 
- return FP8Config(act_observer=["minmax", "kl"]) + def to_config_mapping( + self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None + ): + config_mapping = OrderedDict() + if config_list is None: + config_list = [self] + for config in config_list: + for op_name, op_type in model_info: + config_mapping[(op_name, op_type)] = self + return config_mapping def get_default_fp8_config() -> FP8Config: diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 85e73d47078..08e8d7c889d 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -20,7 +20,7 @@ from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry from neural_compressor.common.utils import Mode, call_counter, log_process -from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig +from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig, FP8Config from neural_compressor.torch.utils import is_ipex_available, logger from neural_compressor.torch.utils.utility import WHITE_MODULE_LIST, algos_mapping, get_model_info @@ -62,8 +62,8 @@ def quantize( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Quantize model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Quantize model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -132,8 +132,8 @@ def prepare( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Prepare model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Prepare model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -179,8 +179,9 @@ def convert( """ q_model = model if inplace else copy.deepcopy(model) - # TODO: Optimize the check for prepared flag after adding HQT FP8 Quant - assert getattr(model, "is_prepared", False), "Please run prepare function before convert." + assert ( + getattr(model, "is_prepared", False) or quant_config is not None + ), "Please pass quant_config to convert function." if getattr(model, "is_prepared", False): if quant_config is None: @@ -195,8 +196,8 @@ def convert( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." 
- logger.info("Convert model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Convert model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -220,3 +221,12 @@ def convert( ) setattr(q_model, "is_quantized", True) return q_model + + +def finalize_calibration(model): + if hasattr(model, "quant_config") and isinstance(model.quant_config, FP8Config): # FP8 + from neural_compressor.torch.algorithms.fp8_quant import save_calib_result + + save_calib_result(model) + else: + raise NotImplementedError("`finalize_calibration` only supports FP8 measurement now.") diff --git a/setup.py b/setup.py index a2392358572..ebabaa97b78 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,8 @@ def get_build_version(): return __version__ try: result = subprocess.run(["git", "describe", "--tags"], capture_output=True, text=True, check=True) - _, distance, commit = result.stdout.strip().split("-") + distance = result.stdout.strip().split("-")[-2] + commit = result.stdout.strip().split("-")[-1] return f"{__version__}.dev{distance}+{commit}" except subprocess.CalledProcessError: return __version__ diff --git a/test/3x/torch/amp/test_fp8_amp.py b/test/3x/torch/amp/test_fp8_amp.py deleted file mode 100644 index a5212467723..00000000000 --- a/test/3x/torch/amp/test_fp8_amp.py +++ /dev/null @@ -1,75 +0,0 @@ -import copy -import os -import shutil -import unittest - -import torch - -from neural_compressor.torch.amp import autocast -from neural_compressor.torch.utils import is_hpex_available - -# if not is_hpex_available(): -# exit() - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = torch.matmul(inp.T, x2) - x3 = x3.unsqueeze(0) - x3 = torch.bmm(x3, x3) - return x3 - - -@unittest.skipIf(not is_hpex_available(), "HPEX is required for HPU inference") -class TestPytorchFP8Adaptor(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_autocast(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.bfloat16) and torch.no_grad(): - bf16_out = m(inp) - print("BF16 MSE:", (bf16_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - print("FP8_E4M3 MSE:", (e4m3_out - fp32_out).pow(2).sum()) - - def test_autocast_use_amax(self): - os.environ["PT_USE_FP8_AMAX"] = str(1) - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 using amax MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - print("FP8_E4M3 using amax MSE:", (e4m3_out - fp32_out).pow(2).sum()) - os.environ.pop("PT_USE_FP8_AMAX", None) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/torch/quantization/habana_fp8/test_fp8.py 
b/test/3x/torch/quantization/habana_fp8/test_fp8.py deleted file mode 100644 index 8fafc302f65..00000000000 --- a/test/3x/torch/quantization/habana_fp8/test_fp8.py +++ /dev/null @@ -1,189 +0,0 @@ -import copy -import shutil - -import pytest -import torch - -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - from neural_compressor.torch.algorithms.habana_fp8.modules import ( - BatchMatmul, - FP8BatchMatmul, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8Matmul, - Matmul, - ) - from neural_compressor.torch.quantization import ( - FP8Config, - TuningConfig, - autotune, - get_default_fp8_config, - get_default_fp8_config_set, - quantize, - ) - - torch.set_grad_enabled(False) - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - self.mm = Matmul() - self.bmm = BatchMatmul() - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = self.mm(inp.T, x2) - x3 = x3.unsqueeze(0) - x4 = self.mm(inp.T, x2) - x4 = x4.unsqueeze(0) + 1 ## SW-178838 - x5 = self.bmm(x3, x4) - x6 = self.bmm(x3, x4) - out = x5 + x6 - return out - - -@pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.") -class TestPytorchFP8Adaptor: - def setup_class(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - self.fp32_out = self.model(self.inp) - - def teardown_class(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_dynamic_accu(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e5m2", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E5M2 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e4m3", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - qconfig = FP8Config(approach="dynamic") - m = quantize(m, qconfig, inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." 
- print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - @pytest.mark.parametrize("dtype", ["fp8_e5m2", "fp8_e4m3"]) - @pytest.mark.parametrize("w_observer", ["minmax", "minmax_per_channel"]) - @pytest.mark.parametrize("act_observer", ["minmax", "kl"]) - def test_static_accu(self, dtype, w_observer, act_observer): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = FP8Config( - w_dtype=dtype, w_observer=w_observer, act_dtype=dtype, act_observer=act_observer, approach="static" - ) - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - fp8_out = m(inp) - print("Static quantization config:", dtype, w_observer, act_observer) - print("Static quantization MSE:", (self.fp32_out - fp8_out).pow(2).sum()) - - def test_convert(self): - # Temporary implementation of fp8 tensor saving and loading - # Will remove after Habana torch applies below patch: - # https://github.com/pytorch/pytorch/pull/114662 - # e4m3 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e4m3fn)[0].to("cpu") - import fp8_convert - - int8_inp = fp8_convert.to_u8(fp8_inp) - torch.save(int8_inp, "tmp.pt") - saved_int8_inp = torch.load("tmp.pt") - recovered_inp = fp8_convert.from_u8(saved_int8_inp, 1) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - # e5m2 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e5m2)[0].to("cpu") - int8_inp = fp8_convert.to_u8(fp8_inp) - recovered_inp = fp8_convert.from_u8(int8_inp, 0) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - - def test_save_load(self): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = get_default_fp8_config() - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - fp8_out = m(inp) - m.save("saved_results") - - from neural_compressor.torch.quantization import load - - m = copy.deepcopy(self.model) - m = load("saved_results", m) - recovered_out = m(inp) - assert (recovered_out == fp8_out).all(), "Unexpected result. Please double check." - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - - def test_autotune(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - - def calib_func(model): - model(inp) - - accu_list = [1.0, 0.9, 0.99] - - def eval_func(model): - nonlocal accu_list - return accu_list.pop() - - tune_config = TuningConfig( - config_set=get_default_fp8_config_set(), - tolerable_loss=0.01, - ) - best_model = autotune( - model=m, - tune_config=tune_config, - run_fn=calib_func, - eval_fns=eval_func, - ) - assert isinstance(best_model.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(best_model.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(best_model.bmm, FP8BatchMatmul), "Unexpected result. Please double check." 
From ca1444bd80c5a0b2ee16185cf6962e4b6a3f8c93 Mon Sep 17 00:00:00 2001 From: Uri Livne Date: Wed, 19 Jun 2024 15:05:12 +0300 Subject: [PATCH 04/51] [SW-189361] Fix white list extend Change-Id: Ic2021c248798fce37710d28014a6d59259c868a3 --- neural_compressor/torch/quantization/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index ecc18848e52..d8aefe1f3ff 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -1610,12 +1610,12 @@ def get_default_hqq_config() -> HQQConfig: ######################## FP8 Quant Config ############################### # refer to habana_quantization_toolkit/_core/common.py -FP8_WHITE_LIST = ( +FP8_WHITE_LIST = [ "Matmul", "Linear", "FalconLinear", "KVCache", "Conv2d", - "LoRACompatibleLinear", "LoRACompatibleConv", "Softmax", "ModuleFusedSDPA") + "LoRACompatibleLinear", "LoRACompatibleConv", "Softmax", "ModuleFusedSDPA"] if importlib.util.find_spec("deepspeed"): - FP8_WHITE_LIST.append( - "LinearLayer", "LinearAllreduce","ScopedLinearAllReduce", "LmHeadLinearAllreduce") + FP8_WHITE_LIST.extend( + ["LinearLayer", "LinearAllreduce","ScopedLinearAllReduce", "LmHeadLinearAllreduce"]) @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT) class FP8Config(TorchBaseConfig): From dfec104431916e5d0f6e6de0cf411c45d3f6e7b7 Mon Sep 17 00:00:00 2001 From: Uri Livne Date: Wed, 3 Jul 2024 17:22:02 +0300 Subject: [PATCH 05/51] [SW-191317] Raise exception according to hqt config object Change-Id: I06ba8fa912c811c88912987c11e5c12ef328348a --- neural_compressor/torch/algorithms/fp8_quant/common.py | 8 +++++++- neural_compressor/torch/quantization/quantize.py | 4 ---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/neural_compressor/torch/algorithms/fp8_quant/common.py b/neural_compressor/torch/algorithms/fp8_quant/common.py index b038a367a78..4a603c677ac 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/common.py @@ -24,7 +24,13 @@ def save_calib_result(model): import habana_quantization_toolkit as hqt - hqt.finish_measurements(model) + if (hasattr(model, "__hqt_config__") and + isinstance(model.__hqt_config__, hqt._quant_common.quant_config.Fp8cfg)): + # TODO SW-184714 modify hqt notation to inc notation once code is ported + hqt.finish_measurements(model) + else: + raise NotImplementedError("Saving calibration results currently supported only in HPU.") + def update_mode(config_path, measure_step=False, quant_step=False): diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 08e8d7c889d..5c161e5bb8b 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -224,9 +224,5 @@ def convert( def finalize_calibration(model): - if hasattr(model, "quant_config") and isinstance(model.quant_config, FP8Config): # FP8 from neural_compressor.torch.algorithms.fp8_quant import save_calib_result - save_calib_result(model) - else: - raise NotImplementedError("`finalize_calibration` only supports FP8 measurement now.") From 216d94b278cbb62b425fadc37161ecacd5f09449 Mon Sep 17 00:00:00 2001 From: Uri Livne Date: Sat, 6 Jul 2024 20:06:08 +0300 Subject: [PATCH 06/51] [SW-184714] Port HQT code into INC HQT lib content was copied as is under fp8_quant Tests were copied to 3.x torch location 
Change-Id: Iec6e1fa7ac4bf1df1c95b429524c40e32bc13ac9 --- .../torch/algorithms/fp8_quant/__init__.py | 1 + .../algorithms/fp8_quant/_core/__init__.py | 0 .../algorithms/fp8_quant/_core/common.py | 255 ++++++ .../algorithms/fp8_quant/_core/fp_utils.py | 172 ++++ .../algorithms/fp8_quant/_core/measure.py | 419 +++++++++ .../fp8_quant/_core/quant_dequant.py | 55 ++ .../algorithms/fp8_quant/_core/quantize.py | 96 +++ .../torch/algorithms/fp8_quant/_core/scale.py | 438 ++++++++++ .../fp8_quant/_core/scale_methods/__init__.py | 3 + .../fp8_quant/_core/scale_methods/max_abs.py | 397 +++++++++ .../_core/scale_methods/smooth_quant.py | 118 +++ .../_core/scale_methods/unit_scale.py | 52 ++ .../torch/algorithms/fp8_quant/_core/utils.py | 49 ++ .../fp8_quant/_quant_common/__init__.py | 0 .../fp8_quant/_quant_common/helper_modules.py | 812 ++++++++++++++++++ .../fp8_quant/_quant_common/quant_config.py | 250 ++++++ .../torch/algorithms/fp8_quant/common.py | 8 +- .../custom_config/custom_example.json | 5 + .../custom_config/llama_measure.json | 14 + .../fp8_quant/custom_config/llama_quant.json | 17 + .../custom_config/measure_config.json | 12 + .../fp8_quant/custom_config/quant_config.json | 13 + .../torch/algorithms/fp8_quant/fp8_quant.py | 7 +- .../fp8_quant/prepare_quant/__init__.py | 0 .../fp8_quant/prepare_quant/prepare_model.py | 36 + .../algorithms/fp8_quant/scripts/__init__.py | 0 .../scripts/regression_detection/__init__.py | 0 .../regression_detection/golden_metrics.json | 74 ++ .../regression_detection.py | 117 +++ .../algorithms/fp8_quant/utils/__init__.py | 0 .../algorithms/fp8_quant/utils/logger.py | 240 ++++++ .../3x/torch/algorithms/fp8_quant/__init__.py | 6 + .../3x/torch/algorithms/fp8_quant/conftest.py | 12 + .../torch/algorithms/fp8_quant/fp8_tests.py | 174 ++++ test/3x/torch/algorithms/fp8_quant/pytest.ini | 3 + .../fp8_quant/test_jsons/test_hw_quant.json | 16 + ...st_hw_quant_ignored_unmeasured_models.json | 17 + .../fp8_quant/test_jsons/test_measure.json | 13 + .../fp8_quant/test_jsons/test_pow2_quant.json | 16 + .../fp8_quant/test_jsons/test_unit_quant.json | 16 + test/3x/torch/algorithms/fp8_quant/tester.py | 218 +++++ .../fp8_quant/unit_tests/__init__.py | 6 + .../fp8_quant/unit_tests/test_deepspeed.py | 86 ++ .../test_functions/test_config_json.py | 29 + .../test_functions/test_matmul_fp8.py | 71 ++ .../unit_tests/test_layers/test_conv2d.py | 40 + .../unit_tests/test_layers/test_linear.py | 33 + .../unit_tests/test_layers/test_matmul.py | 56 ++ 48 files changed, 4465 insertions(+), 7 deletions(-) create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/common.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/measure.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/scale.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/max_abs.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/smooth_quant.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/unit_scale.py create mode 100644 
neural_compressor/torch/algorithms/fp8_quant/_core/utils.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_quant_common/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/custom_config/custom_example.json create mode 100644 neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_measure.json create mode 100644 neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_quant.json create mode 100755 neural_compressor/torch/algorithms/fp8_quant/custom_config/measure_config.json create mode 100755 neural_compressor/torch/algorithms/fp8_quant/custom_config/quant_config.json create mode 100644 neural_compressor/torch/algorithms/fp8_quant/prepare_quant/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/prepare_quant/prepare_model.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/scripts/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/golden_metrics.json create mode 100644 neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/regression_detection.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/utils/__init__.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/utils/logger.py create mode 100644 test/3x/torch/algorithms/fp8_quant/__init__.py create mode 100644 test/3x/torch/algorithms/fp8_quant/conftest.py create mode 100644 test/3x/torch/algorithms/fp8_quant/fp8_tests.py create mode 100644 test/3x/torch/algorithms/fp8_quant/pytest.ini create mode 100644 test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant.json create mode 100644 test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant_ignored_unmeasured_models.json create mode 100644 test/3x/torch/algorithms/fp8_quant/test_jsons/test_measure.json create mode 100644 test/3x/torch/algorithms/fp8_quant/test_jsons/test_pow2_quant.json create mode 100644 test/3x/torch/algorithms/fp8_quant/test_jsons/test_unit_quant.json create mode 100644 test/3x/torch/algorithms/fp8_quant/tester.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/__init__.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_deepspeed.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_matmul_fp8.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py create mode 100644 test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py diff --git a/neural_compressor/torch/algorithms/fp8_quant/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/__init__.py index d16760b5e81..bea97db811c 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/__init__.py +++ b/neural_compressor/torch/algorithms/fp8_quant/__init__.py @@ -18,4 +18,5 @@ restore_patched_module, with_patched_module, ) +from neural_compressor.torch.algorithms.fp8_quant.prepare_quant.prepare_model import finish_measurements, prep_model from neural_compressor.torch.algorithms.fp8_quant.fp8_quant import 
FP8Quantizer diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/_core/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/common.py b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py new file mode 100644 index 00000000000..c155146dcc6 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/common.py @@ -0,0 +1,255 @@ +import os +import torch +import json +import numpy as np +import functools +import importlib.util + +from .._quant_common.helper_modules import * +from .._quant_common.quant_config import get_hqt_config +from ..utils.logger import logger + +deepspeed_exists = False +if importlib.util.find_spec("deepspeed"): # check if deepspeed is installed + deepspeed_exists = True + +UNMEASURED_MODELS = "UnmeasuredModels" + + +class ModuleInfo: + def __init__(self, type, patched_module): + self.type = type + self.patched_module = patched_module + + +class ModuleConfig: + def __init__(self, inputs=(None,), outputs=(None,), params=None): + self.inputs = inputs + self.outputs = outputs + self.params = params if params is not None else {} + + +class ModuleExtraConfig: + def __init__(self, inputs=(None,), outputs=(None,), params=None, scale=None, config_params=None): + self.inputs = inputs + self.outputs = outputs + self.params = params if params is not None else {} + self.scale = scale + self.config_params = config_params if config_params is not None else {} + + +class ModuleType: + def __init__(self, num_inputs, param_names, num_outputs, required_output): + self.num_inputs = num_inputs + self.param_names = param_names + self.num_outputs = num_outputs + self.required_output = required_output + + +mod_types = { + "linear": ModuleType(1, ["weight"], 1, False), + "matmul": ModuleType(2, [], 1, False), + "kv_cache": ModuleType(1, [], 1, False), + "softmax": ModuleType(1, [], 1, True), + "fused_sdpa": ModuleType(3, [], 2, True), +} +descale_fcn = lambda x, scale: torch.mul(x, scale) +scale_fcn = lambda x, scale: torch.div(x, scale) +mat_scale_fcn = lambda x, scale_col, scale_row: torch.div(torch.div(x, scale_col), scale_row) +cast_fcn = lambda x, dtype: x.to(dtype=dtype) +cast_to_fp8_fcn = lambda x, dtype, scale_inv=None: torch.ops.hpu.cast_to_fp8_v2(x, scale_inv, False, False, dtype)[0] +cast_from_fp8_fcn = lambda x, dtype, scale=None: torch.ops.hpu.cast_from_fp8(x, scale, dtype) + + +class ShapeList: + data = None + + +def rec_fn(x, fn): + if isinstance(x, dict): + return {k: rec_fn(x[k], fn) for k in x} + elif isinstance(x, list): + return [rec_fn(k, fn) for k in x] + elif isinstance(x, tuple): + return tuple([rec_fn(k, fn) for k in x]) + else: + return fn(x) + + +def np_to_pt(x): + return rec_fn(x, lambda x: torch.tensor(x) if isinstance(x, np.ndarray) else x) + + +def pt_to_np(x): + return rec_fn( + x, + lambda x: (x.detach().cpu().float().numpy() if isinstance(x, torch.Tensor) else x), + ) + + +def np_to_list(x): + return rec_fn(x, lambda x: x.tolist() if isinstance(x, np.ndarray) else x) + + +def list_to_np(x): + return rec_fn(x, lambda x: np.array(x) if isinstance(x, list) else x) + + +def save_json(d, fname): + with open(fname, "w") as f: + json.dump(d, f, indent=4) + + +def load_json(fname): + with open(fname, "r") as f: + d = json.load(f) + return d + + +def save_npz(d, fname): + np.savez(fname, d) + + +def load_npz(fname): + d = np.load(fname, allow_pickle=True) + return d["arr_0"].item() + + +def save_file(model, 
d, source_format, fname, mode): + config = get_hqt_config(model) + logger.debug("Saving %s file: %s", mode, fname) + ext = os.path.splitext(fname)[1] + target_format = file_functions[ext][0] + dc = rec_fn(d, format_functions[(source_format, target_format)]) + df = { + "GlobalRank": config.cfg["global_rank"], + "LocalRank": config.cfg["local_rank"], + "Mode": mode, + "Nodes": dc, + } + try: + file_functions[ext][1](df, fname) + except: + pass + + +# convert module config data to other format +def module_convert(m, fcn): + mt = ModuleConfig( + tuple([fcn(x) for x in m.inputs]), + tuple([fcn(m.outputs)],) if type(m.outputs) == np.ndarray else tuple([fcn(y) for y in m.outputs]), + {k: fcn(m.params[k]) for k in m.params}, + ) + return mt + + +def fix_fields(d): + if "input" in d: + d["inputs"] = d.pop("input") + if "output" in d: + d["outputs"] = d.pop("output") + return d + + +def load_file(fname, target_format, fail_on_file_not_exist): + logger.debug("Loading file: %s", fname) + ext = os.path.splitext(fname)[1] + source_format = file_functions[ext][0] + d = {} + if os.path.isfile(fname): + d = file_functions[ext][2](fname) + elif fail_on_file_not_exist: + raise FileNotFoundError(f"Failed to load file {fname}") + if "Nodes" in d: + dc = {k: ModuleConfig(**fix_fields(d["Nodes"][k])) for k in d["Nodes"]} + dc = {k: module_convert(dc[k], format_functions[(source_format, target_format)]) for k in dc} + else: + dc = {} + return dc + + +def save_scales(model, d, source_format, fname): + dc = {k: d[k].__dict__ for k in d} + save_file(model, dc, source_format, fname, "Scale") + + +def load_scales(fname, target_format): + logger.debug("Loading scales file %s", fname) + d = load_file(fname, target_format, False) + return d + + +def convert_scales_to_tensors_dict(scales_obj, scales_file_format, hp_dtype): + scales_temp = {k: scales_obj[k].__dict__ for k in scales_obj} + scales_temp = format_functions_rec((scales_file_format, torch.Tensor))(scales_temp) + scales_temp = rec_fn(scales_temp, lambda x: x.to(dtype=hp_dtype, device="hpu")) + scales = {k: ModuleConfig(**scales_temp[k]) for k in scales_temp} + return scales + + +file_functions = { + ".json": (list, save_json, load_json), + ".npz": (np.ndarray, save_npz, load_npz), +} + +format_functions = { + (torch.Tensor, torch.Tensor): lambda x: x, + (np.ndarray, np.ndarray): lambda x: x, + (list, list): lambda x: x, + (torch.Tensor, np.ndarray): lambda x: x.detach().cpu().float().numpy(), + (torch.Tensor, list): lambda x: x.detach().cpu().float().numpy().tolist(), + (np.ndarray, torch.Tensor): torch.tensor, + (np.ndarray, list): lambda x: x.tolist(), + (list, torch.Tensor): torch.tensor, + (list, np.ndarray): lambda x: np.array(x), + (list, ShapeList): lambda x: [int(s) for s in x[0]], +} + + +format_functions_rec = lambda k: functools.partial(rec_fn, fn=format_functions[k]) + +mod_default_dict = { + "Matmul": ModuleInfo("matmul", PatchedMatmul), + "Linear": ModuleInfo("linear", PatchedLinear), + "RowParallelLinear": ModuleInfo("linear", PatchedRowParallelLinear), + "ColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear), + "MergedColumnParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear), + "QKVParallelLinear": ModuleInfo("linear", PatchedColumnParallelLinear), + "FalconLinear": ModuleInfo("linear", PatchedLinear), + "KVCache": ModuleInfo("kv_cache", PatchedKVCache), + "VLLMKVCache": ModuleInfo("kv_cache", PatchedVLLMKVCache), + "Conv2d": ModuleInfo("linear", PatchedConv2d), + "LoRACompatibleLinear": ModuleInfo("linear", 
PatchedLoRACompatibleLinear), + "LoRACompatibleConv": ModuleInfo("linear", PatchedLoRACompatibleConv), + "Softmax": ModuleInfo("softmax", PatchedSoftmax), + "ModuleFusedSDPA": ModuleInfo("fused_sdpa", PatchedModuleFusedSDPA), +} + + +if deepspeed_exists: + mod_default_dict.update( + { + "LinearLayer": ModuleInfo("linear", PatchedLinear), + "LinearAllreduce": ModuleInfo("linear", PatchedLinearAllReduce), + "ScopedLinearAllReduce": ModuleInfo("linear", PatchedLinearAllReduce), + "LmHeadLinearAllreduce": ModuleInfo("linear", PatchedLmHeadLinearAllreduce), + } + ) + + +class ModInstInfo: + def __init__(self, name, parent): + self.name = name + self.parent = parent + + +parent_child_mod_dict = {} + + +def generate_model_info(model): + def create_mod_info_recursion(parent): + for name, mod in parent.named_children(): + parent_child_mod_dict[mod] = ModInstInfo(name, parent) + create_mod_info_recursion(mod) + + create_mod_info_recursion(model) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py new file mode 100644 index 00000000000..14f54d4eaa8 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/fp_utils.py @@ -0,0 +1,172 @@ +import torch +import habana_frameworks.torch.core as htcore +import habana_frameworks.torch.utils.experimental as htexp +from .common import * + +GAUDI2 = htexp.synDeviceType.synDeviceGaudi2 +GAUDI3 = htexp.synDeviceType.synDeviceGaudi3 + +EXP_WIDTH = { + torch.float32: 8, + torch.bfloat16: 8, + torch.float8_e4m3fn: 4, + torch.float8_e5m2: 5, +} + + +def get_default_exp_bias(dtype): + exp_width = EXP_WIDTH[dtype] + return 2 ** (exp_width - 1) - 1 + + +EXP_BIAS_SETS = { + (GAUDI2, torch.float8_e4m3fn): [3, 7, 11, 15], + (GAUDI2, torch.float8_e5m2): [15], + (GAUDI3, torch.float8_e4m3fn): range(0, 63), + (GAUDI3, torch.float8_e5m2): range(0, 63), +} + +MAX_RANGE = { + torch.float32: 2 ** ((2**8 - 2 - get_default_exp_bias(torch.float32))) * (2 - 2 ** -(23)), + torch.bfloat16: 2 ** ((2**8 - 2 - get_default_exp_bias(torch.bfloat16))) * (2 - 2 ** -(7)), + torch.float8_e4m3fn: 2 ** ((2**4 - 2 - get_default_exp_bias(torch.float8_e4m3fn))) * (2 - 2 ** -(8 - 1 - 4)), + torch.float8_e5m2: 2 ** ((2**5 - 2 - get_default_exp_bias(torch.float8_e5m2))) * (2 - 2 ** -(8 - 1 - 5)), +} + + +def get_fullscale(dtype, exp_bias=None): + default_exp_bias = get_default_exp_bias(dtype) + fullscale = MAX_RANGE[dtype] + exp_bias = default_exp_bias if exp_bias == None else exp_bias + fullscale = fullscale * (2 ** (default_exp_bias - exp_bias)) + return fullscale + + +def get_fullscales_by_expbias_set(dtype, expbias_set): + return [get_fullscale(dtype, exp_bias=eb) for eb in expbias_set] + + +def get_fp8_hw_alligned_scales(dtype, device): + exp_bias_set = EXP_BIAS_SETS.get((device, dtype), None) + return ( + None + if exp_bias_set == None + else [x / MAX_RANGE[dtype] for x in get_fullscales_by_expbias_set(dtype, exp_bias_set)] + ) + + +DEVICES_SCALE_FACTORS = { + htexp.synDeviceType.synDeviceGaudi2: 4, + htexp.synDeviceType.synDeviceGaudi3: 1, +} +FP8_143_SCALES = { + device: get_fp8_hw_alligned_scales(torch.float8_e4m3fn, device) for device in DEVICES_SCALE_FACTORS.keys() +} +FP8_143_SCALES_TRAITS = { + device: ( + min(FP8_143_SCALES[device]), + max(FP8_143_SCALES[device]), + DEVICES_SCALE_FACTORS[device], + ) + for device in DEVICES_SCALE_FACTORS.keys() +} + + +def calc_maxabs_scale(xmaxabs, fullscale, backoff=1): + scale = xmaxabs / (fullscale * backoff) + return scale + + +def 
scale_to_pow2(scale): + scale_pow2 = 2 ** torch.ceil(torch.log2(scale)) + return scale_pow2 + + +# Considering range of hw alligned scales: 2^a, 2^a+1,..., 2^b (a=2^b then s=2^b, therefor min(_, 2^b) +# if m<=2^a then s=2^a, therefor max(_, 2^a) --> 2^a <= min(max(_,2^a),2^b) <=2^b +# if s^a 0: + sd[mname]["params"] = dict() + sdl[mname]["params"] = dict() + for param_name in mcd[mname].params: + if mcd[mname].params[param_name].state is not None: + sd[mname]["params"][param_name] = ( + mcd[mname].params[param_name].state.detach().cpu().float().numpy() + ) + sdl[mname]["params"][param_name] = ( + mcd[mname].params[param_name].state.detach().cpu().float().numpy().tolist() + ) + return sd, sdl + + +def save_measurements(model, fname=None): + config = get_hqt_config(model).cfg + if config["mode"] in [QuantMode.MEASURE, QuantMode.SHAPE]: + if fname is None: + if ("measure_file" in config) and (config["measure_file"] is not None): + fname_base = config["measure_file"] + measure_type = "DynamicRange" + elif ("shape_file" in config) and (config["shape_file"] is not None) and (config["observer"] == "shape"): + fname_base = config["shape_file"] + measure_type = "Shape" + fname_np = fname_base + ".npz" + fname_list = fname_base + ".json" + else: + logger.warning("'fname' is not None - Measurements/Shapes will not be saved") + return + mcd = get_mod_extra_config_dict(model) + sd, sdl = measure_control_to_state_dict(mcd) + + logger.info("Dumping measurements") + save_file(model, sd, np.ndarray, fname_np, measure_type) + save_file(model, sdl, list, fname_list, measure_type) + save_json(gmod_list, fname_base + "_mod_list.json") + + +def load_measurements(model, fname): + config = get_hqt_config(model).cfg + source_fname = fname if fname is not None else config["measure_file"] + fname_np = source_fname + ".npz" + d = load_file( + fname_np, + np.ndarray, + fail_on_file_not_exist=(config["scale_method"] != ScaleMethod.UNIT_SCALE), + ) + from collections import defaultdict + + d = defaultdict(lambda: None, d) + + return d + + +def get_default_config(mod_list): + config = {k: "default" for k in mod_list} + return config + + +def save_json(d, fname): + with open(fname, "w") as f: + json.dump(d, f, indent=4) + + +def load_json(fname): + with open(fname, "r") as f: + d = json.load(f) + return d + + +class MaxAbsObserver: + def __init__(self, name, mod, d_shape=None, params=None): + self.name = name + self.mod = mod + self.first = True + self.used = False + self.state = self.init_state_from_shape(d_shape) + + def init_state(self, x): + device = x.device + state = torch.zeros((1, 1), device=device, dtype=torch.float32) + self.shape = list(x.shape) + return state + + def init_state_from_shape(self, x_shape, device="hpu"): + state = torch.zeros((1, 1), device=device, dtype=torch.float32) + self.first = False + return state + + def update_state(self, x): + # TODO: [SW-189690] Find better way to update self.state in MaxAbsObserver class in HQT + self.state = torch.maximum(torch.max(torch.abs(x)), self.state) + + def measure(self, x): + if self.first: + self.state = self.init_state(x) + self.first = False + self.update_state(x) + self.used = True + + def is_used(self): + return self.used + + +class MaxAbsPerChannelObserver: + def __init__(self, name, mod, d_shape=None, params=None): + self.name = name + self.mod = mod + self.first = True + self.state = None + self.used = False + self.dim = params["dim"] if (params is not None) and ("dim" in params) else -1 + if d_shape is not None: + p = list(range(len(d_shape))) + 
self.dim = self.dim if self.dim >= 0 else len(d_shape) + self.dim + p[-1] = self.dim + p[self.dim] = len(d_shape) - 1 + self.p = p + self.state = self.init_state_from_shape(d_shape) + + def init_state(self, x): + device = x.device + Nch = x.shape[self.dim] + self.Nch = Nch + state = torch.zeros((Nch, 1), device=device, dtype=torch.float32) + self.shape = list(x.shape) + return state + + def init_state_from_shape(self, x_shape, device="hpu"): + device = device + Nch = x_shape[self.dim] + self.Nch = Nch + state = torch.zeros((Nch, 1), device=device, dtype=torch.float32) + self.first = False + return state + + def update_state(self, x): + self.state.copy_( + torch.maximum( + torch.max( + torch.abs(x.permute(self.p).reshape([-1, self.Nch])), + dim=0, + keepdim=True, + )[0].t(), + self.state, + ) + ) + + def measure(self, x): + if self.first: + self.state = self.init_state(x) + self.first = False + self.update_state(x) + self.used = True + + def is_used(self): + return self.used + + +def save_module(mod): + folder_name = os.path.join(mod.config["dump_stats_base_path"], "tensors") + os.makedirs(folder_name, exist_ok=True) + file_base_name = os.path.join(folder_name, imod_dict[mod] + "_module.pt") + torch.save(mod.state_dict(), file_base_name) + + +class SaveObserver: + def __init__(self, name, mod, d_shape=None, params=None): + self.name = name + self.mod = mod + self.first = True + self.cnt = -1 + self.folder_name = os.path.join(config["dump_stats_base_path"], "tensors") + os.makedirs(self.folder_name, exist_ok=True) + self.file_base_name = os.path.join(self.folder_name, imod_dict[mod] + "_" + name + "_iter") + self.state = self.init_state_from_shape(d_shape) + self.used = False + + def init_state(self, x): + device = x.device + state = torch.zeros((1, 1), device=device, dtype=torch.float32) + self.shape = list(x.shape) + return state + + def init_state_from_shape(self, x_shape, device="hpu"): + state = torch.zeros((1, 1), device=device, dtype=torch.float32) + self.first = False + return state + + def update_state(self, x): + self.cnt += 1 + torch.save(x, self.file_base_name + str(self.cnt) + ".pt") + + def measure(self, x): + self.update_state(x) + self.used = True + + def is_used(self): + return self.used + + +class ShapeObserver: + def __init__(self, name, mod, d_shape=None, params=None): + self.name = name + self.mod = mod + self.state = None + + def init_state(self, x): + device = x.device + Ndim = len(x.shape) + self.Ndim = Ndim + state = torch.tensor(x.shape, device=device, dtype=torch.int32).reshape((1, Ndim)) + return state + + def init_state_from_shape(self, x_shape, device="hpu"): + logger.info("ShapeObserver doesn't support init_state_from_shape") + return + + def update_state(self, x): + logger.info("ShapeObserver doesn't support update_state") + return + + def measure(self, x): + self.state = self.init_state(x) + + def is_used(self): + return self.state is not None + + +observer_types = { + "shape": ShapeObserver, + "maxabs": MaxAbsObserver, + "maxabs_per_channel": MaxAbsPerChannelObserver, + "save": SaveObserver, +} + +observer_params = { + "maxabs_per_channel": { + "linear": ModuleConfig(({"dim": -1},), ({"dim": -1},), {"weight": {"dim": 0}}), + "matmul": ModuleConfig(({"dim": -1}, {"dim": -2},), ({"dim": -1},), None), + } +} diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py new file mode 100644 index 00000000000..50b604b7d89 --- /dev/null +++ 
b/neural_compressor/torch/algorithms/fp8_quant/_core/quant_dequant.py @@ -0,0 +1,55 @@ +import torch.nn as nn +from abc import abstractmethod +from .common import * + + +class QuantDequantBase(nn.Module): + def __init__(self, lp_dtype, hp_dtype="", *args, **kwargs): + super(QuantDequantBase, self).__init__(*args, **kwargs) + self.lp_dtype = lp_dtype + self.hp_dtype = hp_dtype + + @abstractmethod + def forward(self, *args, **kwargs): + pass + + def extra_repr(self) -> str: + return f"lp_dtype={self.lp_dtype}, hp_dtype={self.hp_dtype}" + + +class QuantDequantNone(QuantDequantBase): + def __init__(self, lp_dtype, hp_dtype, *args, **kwargs): + super(QuantDequantNone, self).__init__(lp_dtype, hp_dtype, *args, **kwargs) + + def forward(self, *args, **kwargs): + return args[0] + + def extra_repr(self) -> str: + repr = super(QuantDequantNone, self).extra_repr() + return f"{repr}, doesn't quantize nor dequantize" + + +class QuantInput(QuantDequantBase): + def __init__(self, scale_inv, lp_dtype, hp_dtype, *args, **kwargs): + super(QuantInput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs) + self.scale_inv = nn.Parameter(scale_inv) + + def forward(self, x): + return cast_to_fp8_fcn(x, self.lp_dtype, self.scale_inv) + + def extra_repr(self) -> str: + repr = super(QuantInput, self).extra_repr() + return f"{repr}, scale_inv dtype={self.scale_inv.dtype}" + + +class DequantOutput(QuantDequantBase): + def __init__(self, scale, lp_dtype, hp_dtype, *args, **kwargs): + super(DequantOutput, self).__init__(lp_dtype, hp_dtype, *args, **kwargs) + self.scale = nn.Parameter(scale) + + def forward(self, x): + return cast_from_fp8_fcn(x, self.hp_dtype, self.scale) + + def extra_repr(self) -> str: + repr = super(DequantOutput, self).extra_repr() + return f"{repr}, scale dtype={self.scale.dtype}" diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py new file mode 100644 index 00000000000..76ee0a1d635 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/quantize.py @@ -0,0 +1,96 @@ +import torch +import torch.nn as nn +import habana_frameworks.torch.core as htcore +from .._quant_common.quant_config import get_hqt_config +from .._quant_common.helper_modules import PatchedUnmeasuredModule +from .measure import load_measurements +from .scale import scale_method_mapping, get_config, scaling_methods +from .common import ( + mod_default_dict, + generate_model_info, + parent_child_mod_dict, + UNMEASURED_MODELS, +) +from ..utils.logger import logger + + +def patch_module(mod, qconfig, mod_dict, patched_mod=None): + parent = parent_child_mod_dict[mod].parent + name = parent_child_mod_dict[mod].name + if patched_mod is None: + patched_mod = mod_dict[mod.__class__.__name__].patched_module(mod, qconfig) + setattr(parent, name, patched_mod) + + +def apply_hf_hook(module): + if hasattr(module, "_hf_hook"): + module._hf_hook.pre_forward(module) + module._hf_hook.detach_hook(module) + delattr(module, "_hf_hook") + if hasattr(module, "_old_forward"): + module.forward = module._old_forward + delattr(module, "_old_forward") + + +def quantize_params(mod, mod_extra_config): + for param_name in mod_extra_config.params: + quantizer = mod_extra_config.params[param_name] + param = getattr(mod, param_name) + quantized_param = quantizer(param.to("hpu")) + delattr(mod, param_name) + setattr(mod, param_name, nn.Parameter(quantized_param)) + quantized_param = getattr(mod, param_name) + quantized_param.requires_grad_(False) + 
htcore.mark_step() + + +def prepare_model(model, qconfig, mod_list, hp_dtype=torch.float): + config = get_hqt_config(model) + patched_modules = [] + patched_module_types = set() + with torch.no_grad(): + for name, mod in model.named_modules(): + if name in qconfig[UNMEASURED_MODELS]: + if not config.cfg["ignore_modules_wo_measures"]: + patch_module(mod, None, None, PatchedUnmeasuredModule(name)) + else: + logger.debug("Module %s was not quantized.", name) + continue + # When offloading weight to disk, need to transfer the weight from disk to cpu using hf_hook + apply_hf_hook(mod) + if name in mod_list: + mod_extra_config = qconfig[name] + quantize_params(mod, mod_extra_config) + patch_module(mod, mod_extra_config, mod_default_dict) + patched_modules.append(name) + patched_module_types.add(type(mod)) + logger.debug("Patched module types: %s", patched_module_types) + logger.debug("Patched modules: %s", patched_modules) + logger.debug("Total patched modules: %d", len(patched_modules)) + model = model.to("hpu") + htcore.mark_step() + + +def quantize(model, mod_list): + config = get_hqt_config(model) + generate_model_info(model) + hp_dtype = config.cfg["hp_dtype"] + lp_dtype = config.cfg["fp8_config"] + measurement = load_measurements(model, config.cfg["measure_file"]) + # FIXME make sure this takes unit_scale or measured scale, from Configs + scaling_method_name = scale_method_mapping[(config.cfg["scale_method"], config.cfg["observer"])] + scaling_method = scaling_methods[scaling_method_name] + params = config.cfg["scale_params"] + params["hp_dtype"] = hp_dtype + params["lp_dtype"] = lp_dtype + qconfig = get_config( + model, + measurement, + mod_default_dict, + scaling_method, + params, + config.cfg["scale_file"], + False, + mod_list, + ) + prepare_model(model, qconfig, mod_list, hp_dtype=hp_dtype) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale.py new file mode 100644 index 00000000000..a85c79b660b --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale.py @@ -0,0 +1,438 @@ +import torch +import numpy as np + +from .._quant_common.quant_config import ScaleMethod +from .scale_methods import * +from .quant_dequant import * + +from .fp_utils import * +from .common import * +from ..utils.logger import logger + + +def matmul_scales_to_mod_config(mod, scales, params): + scales_inv = invert_scales(scales) + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + input_config = [QuantInput(s_inv, lp_dtype, hp_dtype) for s_inv in scales_inv.inputs] + # outputs as bf16, and descaled in gemm under PatchedMatmul, so no need to work here + output_config = [QuantDequantNone(lp_dtype, hp_dtype)] + config = ModuleConfig(input_config, output_config, {}) + return config + + +def fsdpa_scales_to_mod_config(mod, scales, params): + scales_inv = invert_scales(scales) + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + input_config = [QuantInput(s_inv, lp_dtype, hp_dtype) for s_inv in scales_inv.inputs] + output_config = [DequantOutput(scales.outputs[0], lp_dtype, hp_dtype)] + config = ModuleConfig(input_config, output_config, {}) + return config + + +def linear_scales_to_mod_config(mod, scales, params): + scales_inv = invert_scales(scales) + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + input_config = [QuantInput(scales_inv.inputs[0], lp_dtype, hp_dtype)] + # outputs as bf16, and descaled in gemm under PatchedLinear, so no need to work here + output_config = 
[QuantDequantNone(lp_dtype, hp_dtype)] + if isinstance(scales_inv.params["weight"], (torch.Tensor, float)): + weight_config = QuantInput(scales_inv.params["weight"], lp_dtype, hp_dtype) + elif isinstance(scales_inv.params["weight"], dict): + weight_scale_inv_out_ch = scales_inv.params["weight"][0] + weight_scale_inv_in_ch = scales_inv.params["weight"][1] + if isinstance(weight_scale_inv_out_ch, torch.Tensor): + scale_inv = torch.mul( + weight_scale_inv_in_ch.reshape([1, -1]), + weight_scale_inv_out_ch.reshape([-1, 1]), + ) + else: + # TODO SW-169781: Handle here scalar weight for PCQ + raise TypeError(f"Unknown weight scales type: {type(weight_scale_inv_out_ch)}.") + weight_config = QuantInput(scale_inv, lp_dtype, hp_dtype) + else: + logger.error("Unknown weight scales format.") + params_config = {"weight": weight_config} + if hasattr(mod, "bias") and (getattr(mod, "bias") is not None): + # In PatchedLinear the bias is added to the output of gemm. + # The output is expected to be descaled and in bf16, so we don't need to touch the bias. + bias_config = QuantDequantNone(lp_dtype, hp_dtype) + params_config.update({"bias": bias_config}) + config = ModuleConfig(input_config, output_config, params_config) + return config + + +def kv_cache_scales_to_mod_config(mod, scales, params): + # how quant/dequant will be applied on layer tensors + scales_inv = invert_scales(scales) + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + input_config = [QuantInput(scales_inv.inputs[0], lp_dtype, hp_dtype)] + output_config = [DequantOutput(scales.outputs[0], lp_dtype, hp_dtype)] + config = ModuleConfig(input_config, output_config) + return config + + +def softmax_scales_to_mod_config(mod, scales, params): + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + output_config = [DequantOutput(scales.outputs[0], lp_dtype, hp_dtype)] + return ModuleConfig(None, output_config) + + +def get_config( + model, + measurement, + mod_dict, + method, + params, + scales_file=None, + recalc_scales=False, + mod_list=None, +): + with torch.no_grad(): + top_level_config = get_hqt_config(model) + qconfig = {UNMEASURED_MODELS: []} + scales_file_format = np.ndarray # file_functions[os.path.splitext(scales_file)[1]][0] + scales_obj = ( + load_scales(scales_file + ".npz", scales_file_format) + if (scales_file is not None) and not recalc_scales + else {} + ) + scales = convert_scales_to_tensors_dict(scales_obj, scales_file_format, params["hp_dtype"]) + model_dict = dict(model.named_modules()) + for mname in mod_list: + mod = model_dict[mname] + set_hqt_config(mod, top_level_config) # set config in the module, as it consumed by the patched module + mod_type_str = mod.__class__.__name__ + layer_type = mod_dict[mod_type_str].type + if mname not in scales: + logger.debug("Calcuating scales for layer %s", mname) + if mname not in measurement: + qconfig[UNMEASURED_MODELS].append(mname) + logger.debug( + "Layer '%s' has no measurements therefore it can't be quantized.", + mname, + ) + continue + layer_measure = measurement[mname] # ModuleConfig() of measurements + scales[mname] = method[layer_type][0](mod, layer_measure, params) # ModuleConfig() of scales + if scales_file is not None: + scales_obj[mname] = ModuleConfig( + **format_functions_rec((torch.Tensor, scales_file_format))(scales[mname].__dict__) + ) + + logger.debug( + "Preparing quantization functions for layer %s layer_type=%s", + mname, + layer_type, + ) + mod_config = method[layer_type][1](mod, scales[mname], params) # ModuleConfig() of QuantDequant + 
mod_extra_config = ModuleExtraConfig( + mod_config.inputs, + mod_config.outputs, + mod_config.params, + scales[mname], + params, + ) + qconfig[mname] = mod_extra_config + if scales_file is not None: + save_scales(model, scales_obj, scales_file_format, scales_file + ".npz") + save_scales(model, scales_obj, scales_file_format, scales_file + ".json") + return qconfig + + +scaling_methods = { + "unit_scale": { + "linear": (linear_unit_scale_scales, linear_scales_to_mod_config), + "matmul": (matmul_unit_scale_scales, matmul_scales_to_mod_config), + "softmax": (softmax_unit_scale_scales, softmax_scales_to_mod_config), + "kv_cache": (kv_cache_unit_scale_scales, kv_cache_scales_to_mod_config), + "fused_sdpa": (fsdpa_unit_scale_scales, fsdpa_scales_to_mod_config), + }, + "act_maxabs_pts_weight_maxabs_pts_pow2_hw": { + "linear": ( + linear_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + "kv_cache": ( + kv_cache_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + kv_cache_scales_to_mod_config, + ), + "softmax": ( + softmax_input_unit_output_maxabs_pts_hw_scales, + softmax_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + fsdpa_scales_to_mod_config, + ), + }, + "act_maxabs_pts_weight_maxabs_pts_pow2": { + "linear": ( + linear_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + matmul_scales_to_mod_config, + ), + }, + "act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2": { + "linear": ( + linear_act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + # kv_cache is pts as op in hw doesn't work in pcs + "kv_cache": ( + kv_cache_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + kv_cache_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + fsdpa_scales_to_mod_config, + ), + }, + "act_maxabs_pts_weight_opt_pts_pow2": { + "linear": ( + linear_act_maxabs_pts_weight_opt_pts_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + matmul_scales_to_mod_config, + ), + }, + "act_maxabs_pts_weight_opt_pts_hw": { + "linear": ( + linear_act_maxabs_pts_weight_opt_pts_hw_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + "softmax": ( + softmax_input_unit_output_maxabs_pts_hw_scales, + softmax_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + fsdpa_scales_to_mod_config, + ), + }, + "act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2": { + "linear": ( + linear_act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + # kv_cache is pts as op in hw doesn't work in pcs + "kv_cache": ( + kv_cache_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + kv_cache_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + fsdpa_scales_to_mod_config, + ), + }, + "act_maxabs_pts_pow2_weights_maxabs_pcs_pow2": { + "linear": ( + 
linear_act_maxabs_pts_pow2_weights_maxabs_pcs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + matmul_scales_to_mod_config, + ), + # kv_cache is pts as op in hw doesn't work in pcs + "kv_cache": ( + kv_cache_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + kv_cache_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + fsdpa_scales_to_mod_config, + ), + }, + "act_maxabs_pts_pow2_weights_opt_pcs_pow2": { + "linear": ( + linear_act_maxabs_pts_pow2_weights_opt_pcs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + matmul_scales_to_mod_config, + ), + # kv_cache is pts as op in hw doesn't work in pcs + "kv_cache": ( + kv_cache_act_maxabs_pts_pow2_weight_opt_pcs_pow2_scales, + kv_cache_scales_to_mod_config, + ), + "fused_sdpa": ( + fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_scales, + fsdpa_scales_to_mod_config, + ), + }, + "smoothquant_weights_opt_pow2": { + "linear": ( + linear_smoothquant_weights_opt_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + }, + "smoothquant_weights_maxabs_pow2": { + "linear": ( + linear_smoothquant_weights_maxabs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + }, + "weaksmoothquant_weights_maxabs_pow2": { + "linear": ( + linear_weaksmoothquant_weights_maxabs_pow2_scales, + linear_scales_to_mod_config, + ), + "matmul": ( + matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales, + matmul_scales_to_mod_config, + ), + }, +} + +scale_method_mapping = { + (ScaleMethod.UNIT_SCALE, "maxabs"): "unit_scale", + (ScaleMethod.UNIT_SCALE, "maxabs_per_channel"): "unit_scale", + (ScaleMethod.MAXABS_HW, "maxabs"): "act_maxabs_pts_weight_maxabs_pts_pow2_hw", + (ScaleMethod.MAXABS_POW2, "maxabs"): "act_maxabs_pts_weight_maxabs_pts_pow2", + (ScaleMethod.MAXABS_HW_OPT_WEIGHT, "maxabs"): "act_maxabs_pts_weight_opt_pts_hw", + ( + ScaleMethod.MAXABS_POW2_OPT_WEIGHT, + "maxabs", + ): "act_maxabs_pts_weight_opt_pts_pow2", + ( + ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2, + "maxabs", + ): "act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2, + "maxabs_per_channel", + ): "act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2", + ( + ScaleMethod.SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2, + "maxabs_per_channel", + ): "smoothquant_weights_maxabs_pow2", + ( + ScaleMethod.WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2, + "maxabs_per_channel", + ): "weaksmoothquant_weights_maxabs_pow2", + ( + ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2, + "maxabs", + ): "act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2, + "maxabs_per_channel", + ): "act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2, + "maxabs", + ): "act_maxabs_pts_pow2_weights_maxabs_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2, + "maxabs_per_channel", + ): "act_maxabs_pts_pow2_weights_maxabs_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2, + "maxabs", + ): "act_maxabs_pts_pow2_weights_opt_pcs_pow2", + ( + ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2, + "maxabs_per_channel", + ): "act_maxabs_pts_pow2_weights_opt_pcs_pow2", + ( + 
ScaleMethod.WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2, + "maxabs_per_channel", + ): "weaksmoothquant_weights_maxabs_pow2", + ( + ScaleMethod.SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2, + "maxabs_per_channel", + ): "smoothquant_weights_maxabs_pow2", + (ScaleMethod.SMOOTHQUANT_OPT, "maxabs_per_channel"): "smoothquant_weights_opt_pow2", +} + +scaling_params = { + "unit_scale": {}, + "act_maxabs_pts_weight_maxabs_pts_pow2_hw": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + }, + "act_maxabs_pts_weight_maxabs_pts_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + }, + "act_maxabs_pts_weight_opt_pts_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "weight_scales": [2.0**s for s in range(-10, 10)], + }, + "act_maxabs_pts_weight_opt_pts_hw": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "weight_scales": [2.0**s for s in [4, 0, -4, -8]], + }, + "smoothquant_weights_maxabs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "alpha": 0.5, + }, + "weaksmoothquant_weights_maxabs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "alpha": 0.5, + }, + "act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + }, + "act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "weight_scales": [2.0**s for s in range(-3, 5)], + }, + "act_maxabs_pts_pow2_weights_maxabs_pcs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + }, + "act_maxabs_pts_pow2_weights_opt_pcs_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "weight_scales": [2.0**s for s in range(-3, 5)], + }, + "smoothquant_weights_opt_pow2": { + "input_backoff": 0.25, + "weight_backoff": 0.5, + "alpha": 0.5, + "transformed_weight_scales": [2.0**s for s in range(-3, 5)], + }, +} diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/__init__.py new file mode 100644 index 00000000000..1c0b11e3c99 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/__init__.py @@ -0,0 +1,3 @@ +from .max_abs import * +from .unit_scale import * +from .smooth_quant import * diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/max_abs.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/max_abs.py new file mode 100644 index 00000000000..d991a68aca2 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/max_abs.py @@ -0,0 +1,397 @@ +import torch + +from ..fp_utils import * +from ..common import * + + +def linear_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + weight_scale = calc_maxabs_scale( + torch.max(torch.abs(mod.weight.detach())).to(dtype=hp_dtype, device=device), + fullscale, + weight_backoff, + ) + input_scale = scale_to_pow2_hw(input_scale, device_type=config["device_type"]) + weight_scale = scale_to_pow2_hw(weight_scale, device_type=config["device_type"]) + output_scale = input_scale * weight_scale + return ModuleConfig((input_scale,), (output_scale,), {"weight": 
weight_scale}) + + +def linear_act_maxabs_pts_weight_maxabs_pts_pow2_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + weight_scale = calc_maxabs_scale( + torch.max(torch.abs(mod.weight.detach())).to(dtype=hp_dtype, device=device), + fullscale, + weight_backoff, + ) + input_scale = scale_to_pow2(input_scale) + weight_scale = scale_to_pow2(weight_scale) + output_scale = input_scale * weight_scale + return ModuleConfig((input_scale,), (output_scale,), {"weight": weight_scale}) + + +def matmul_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = [ + calc_maxabs_scale( + torch.tensor(x, dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + for x in measurement.inputs + ] + input_scale = [scale_to_pow2_hw(x, device_type=config["device_type"]) for x in input_scale] + output_scale = [input_scale[0] * input_scale[1]] + return ModuleConfig(input_scale, output_scale, {}) + + +def matmul_act_maxabs_pts_weight_maxabs_pts_pow2_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = [ + calc_maxabs_scale( + torch.tensor(x, dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + for x in measurement.inputs + ] + input_scale = [scale_to_pow2(x) for x in input_scale] + output_scale = [input_scale[0] * input_scale[1]] + return ModuleConfig(input_scale, output_scale, {}) + + +def fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = [ + calc_maxabs_scale( + torch.tensor(x, dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + for x in measurement.inputs + ] + # add amax scale to input scales + input_scale.append( + calc_maxabs_scale( + torch.tensor(measurement.outputs[1], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + ) + input_scale = [scale_to_pow2_hw(x, device_type=config["device_type"]) for x in input_scale] + output_scale = calc_maxabs_scale( + torch.tensor(measurement.outputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + output_scale = [scale_to_pow2_hw(output_scale, device_type=config["device_type"])] + return ModuleConfig(input_scale, output_scale, {}) + + +def fsdpa_act_maxabs_pts_weight_maxabs_pts_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = [ + calc_maxabs_scale( + torch.tensor(x, dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) 
+ for x in measurement.inputs + ] + # fsdpa is combined out of - BMM1(Q,K) -> Softmax -> BMM2(AMAX,V) + # during measure we recieve the amax value from the cguid and apply it during quant as input + input_scale.append( + calc_maxabs_scale( + torch.tensor(measurement.outputs[1], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + ) + input_scale = [scale_to_pow2(x) for x in input_scale] + output_scale = calc_maxabs_scale( + torch.tensor(measurement.outputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + output_scale = [scale_to_pow2(output_scale)] + return ModuleConfig(input_scale, output_scale, {}) + + +def linear_act_maxabs_pts_weight_opt_pts_pow2_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + scales = params["weight_scales"] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + weight_scale = mmse_scale(mod.weight, scales, lp_dtype, hp_dtype) + input_scale = scale_to_pow2(input_scale) + weight_scale = scale_to_pow2(weight_scale) + output_scale = input_scale * weight_scale + return ModuleConfig((input_scale,), (output_scale,), {"weight": weight_scale}) + + +def linear_act_maxabs_pts_weight_opt_pts_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + scales = params["weight_scales"] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + weight_scale = mmse_scale(mod.weight, scales, lp_dtype, hp_dtype) + input_scale = scale_to_pow2_hw(input_scale, device_type=config["device_type"]) + weight_scale = scale_to_pow2_hw(weight_scale, device_type=config["device_type"]) + output_scale = input_scale * weight_scale + return ModuleConfig((input_scale,), (output_scale,), {"weight": weight_scale}) + + +def kv_cache_act_maxabs_pts_weight_maxabs_pts_pow2_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + # calc the scale per layer tensor + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + input_scale_list = [scale_to_pow2_hw(input_scale, device_type=config["device_type"])] + output_scale = [input_scale_list[0]] # output scale is same as the first input (current data) since range is same + return ModuleConfig(input_scale_list, output_scale, {}) + + +def kv_cache_act_maxabs_pts_pow2_weight_opt_pcs_pow2_scales(mod, measurement, params): + # calc the scale per layer tensor + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + input_scale_list = 
[scale_to_pow2(input_scale)] + output_scale = [input_scale_list[0]] # output scale is same as the first input (current data) since range is same + return ModuleConfig(input_scale_list, output_scale, {}) + + +def softmax_input_unit_output_maxabs_pts_hw_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + output_scale = calc_maxabs_scale( + torch.tensor(measurement.outputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + output_scale = [scale_to_pow2_hw(output_scale, device_type=config["device_type"])] + return ModuleConfig((), output_scale, {}) + + +def linear_act_maxabs_pts_pow2_hw_weights_maxabs_pcs_pow2_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + input_scale = scale_to_pow2_hw(input_scale, device_type=config["device_type"]) + weight_scale_in_ch = torch.ones([mod.weight.shape[1], 1], dtype=hp_dtype, device=device) + + weight_range_out_ch = torch.max(torch.abs(mod.weight), dim=1)[0].reshape([-1, 1]) + weight_maxabs_scale_out_ch = calc_maxabs_scale(weight_range_out_ch, fullscale, weight_backoff) + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) + output_scale = weight_maxabs_scale_out_ch * input_scale + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + { + "weight": { + 0: weight_maxabs_scale_out_ch.flatten(), + 1: weight_scale_in_ch.flatten(), + } + }, + ) + + +def linear_act_maxabs_pts_pow2_weights_maxabs_pcs_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + input_scale = scale_to_pow2(input_scale) + weight_scale_in_ch = torch.ones([mod.weight.shape[1], 1], dtype=hp_dtype, device=device) + + weight_range_out_ch = torch.max(torch.abs(mod.weight), dim=1)[0].reshape([-1, 1]) + weight_maxabs_scale_out_ch = calc_maxabs_scale(weight_range_out_ch, fullscale, weight_backoff) + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) + output_scale = weight_maxabs_scale_out_ch * input_scale + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + { + "weight": { + 0: weight_maxabs_scale_out_ch.flatten(), + 1: weight_scale_in_ch.flatten(), + } + }, + ) + + +def linear_act_maxabs_pts_pow2_hw_weights_opt_pcs_pow2_scales(mod, measurement, params): + config = get_hqt_config(mod).cfg + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + weight_scales = params["weight_scales"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + 
) + input_scale = scale_to_pow2_hw(input_scale, device_type=config["device_type"]) + weight_scale_in_ch = torch.ones([mod.weight.shape[1], 1], dtype=hp_dtype, device=device) + + weight_range_out_ch = torch.max(torch.abs(mod.weight), dim=1)[0].reshape([-1, 1]) + weight_maxabs_scale_out_ch = calc_maxabs_scale(weight_range_out_ch, fullscale, weight_backoff) + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) + weight_opt_scale_out_ch = mmse_scale_multi( + torch.transpose(mod.weight, 0, 1), + weight_maxabs_scale_out_ch.squeeze(), + weight_scales, + lp_dtype, + hp_dtype, + ).unsqueeze(1) + weight_maxabs_scale_out_ch = weight_opt_scale_out_ch + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) # should be power of 2, just making sure + output_scale = weight_maxabs_scale_out_ch * input_scale + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + { + "weight": { + 0: weight_maxabs_scale_out_ch.flatten(), + 1: weight_scale_in_ch.flatten(), + } + }, + ) + + +def linear_act_maxabs_pts_pow2_weights_opt_pcs_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + weight_scales = params["weight_scales"] + input_scale = calc_maxabs_scale( + torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max(), + fullscale, + input_backoff, + ) + input_scale = scale_to_pow2(input_scale) + weight_scale_in_ch = torch.ones([mod.weight.shape[1], 1], dtype=hp_dtype, device=device) + + weight_range_out_ch = torch.max(torch.abs(mod.weight), dim=1)[0].reshape([-1, 1]) + weight_maxabs_scale_out_ch = calc_maxabs_scale(weight_range_out_ch, fullscale, weight_backoff) + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) + weight_opt_scale_out_ch = mmse_scale_multi( + torch.transpose(mod.weight, 0, 1), + weight_maxabs_scale_out_ch.squeeze(), + weight_scales, + lp_dtype, + hp_dtype, + ).unsqueeze(1) + weight_maxabs_scale_out_ch = weight_opt_scale_out_ch + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) # should be power of 2, just making sure + output_scale = weight_maxabs_scale_out_ch * input_scale + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + { + "weight": { + 0: weight_maxabs_scale_out_ch.flatten(), + 1: weight_scale_in_ch.flatten(), + } + }, + ) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/smooth_quant.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/smooth_quant.py new file mode 100644 index 00000000000..3a216e6ef15 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/smooth_quant.py @@ -0,0 +1,118 @@ +import torch +from tqdm import tqdm + +from ..fp_utils import * +from ..common import * + + +def linear_smoothquant_weights_opt_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + alpha = params["alpha"] + transformed_weight_scales = params["transformed_weight_scales"] + input_range = torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device) + weight_range_in_ch = torch.max(torch.abs(mod.weight), dim=0)[0].reshape([-1, 1]) + input_scale = calc_maxabs_scale(input_range, fullscale, 
input_backoff) + weight_scale_in_ch = calc_maxabs_scale(weight_range_in_ch, fullscale, weight_backoff) + input_scale = (input_scale**alpha) / (weight_scale_in_ch ** (1 - alpha)) + input_scale = scale_to_pow2(input_scale) + weight_scale_in_ch = 1 / input_scale + trans_weight = scale_fcn(mod.weight, weight_scale_in_ch.reshape([1, -1])) + trans_weight_range_out_ch = torch.max(torch.abs(trans_weight), dim=1)[0].reshape([-1, 1]) + trans_weight_maxabs_scale_out_ch = calc_maxabs_scale(trans_weight_range_out_ch, fullscale, weight_backoff) + trans_weight_maxabs_scale_out_ch = scale_to_pow2(trans_weight_maxabs_scale_out_ch) + trans_weight_scale_out_ch = torch.zeros(mod.weight.shape[0]) + for k in tqdm(range(trans_weight_scale_out_ch.shape[0])): + trans_weight_scale_out_ch[k] = mmse_scale( + trans_weight[k, :], + [s * trans_weight_maxabs_scale_out_ch[k] for s in transformed_weight_scales], + lp_dtype, + hp_dtype, + ) + weight_scale_out_ch = scale_to_pow2(trans_weight_scale_out_ch) + output_scale = torch.tensor(weight_scale_out_ch, dtype=hp_dtype, device=device) + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + {"weight": {0: weight_scale_out_ch.flatten(), 1: weight_scale_in_ch.flatten()}}, + ) + + +def linear_smoothquant_weights_maxabs_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + alpha = params["alpha"] + input_range = torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device) + weight_range_in_ch = torch.max(torch.abs(mod.weight), dim=0)[0].reshape([-1, 1]) + input_scale = calc_maxabs_scale(input_range, 1.0, 1.0) + weight_scale_in_ch = calc_maxabs_scale(weight_range_in_ch, 1.0, 1.0) + input_scale = (input_scale**alpha) / (weight_scale_in_ch ** (1 - alpha)) + input_scale = scale_to_pow2(input_scale) + input_range_post = input_range / input_scale + input_scale_post = calc_maxabs_scale(input_range_post.max(), fullscale, input_backoff) + input_scale_post = scale_to_pow2(input_scale_post) + input_scale = input_scale * input_scale_post + weight_scale_in_ch = 1 / input_scale + trans_weight = scale_fcn(mod.weight, weight_scale_in_ch.reshape([1, -1])) + trans_weight_range_out_ch = torch.max(torch.abs(trans_weight), dim=1)[0].reshape([-1, 1]) + trans_weight_maxabs_scale_out_ch = calc_maxabs_scale(trans_weight_range_out_ch, fullscale, weight_backoff) + trans_weight_maxabs_scale_out_ch = scale_to_pow2(trans_weight_maxabs_scale_out_ch) + weight_scale_out_ch = scale_to_pow2(trans_weight_maxabs_scale_out_ch) + output_scale = torch.tensor(weight_scale_out_ch, dtype=hp_dtype, device=device) + return ModuleConfig( + (input_scale.flatten(),), + (output_scale.flatten(),), + {"weight": {0: weight_scale_out_ch.flatten(), 1: weight_scale_in_ch.flatten()}}, + ) + + +def linear_weaksmoothquant_weights_maxabs_pow2_scales(mod, measurement, params): + device = torch.device("hpu") + lp_dtype = params["lp_dtype"] + hp_dtype = params["hp_dtype"] + fullscale = MAX_RANGE[lp_dtype] + input_backoff = params["input_backoff"] + weight_backoff = params["weight_backoff"] + alpha = params["alpha"] + input_range = torch.tensor(measurement.inputs[0], dtype=hp_dtype, device=device).max().clamp(min=1e-5) + input_range_mid = input_range.max() / torch.sqrt(input_range.max() / input_range.min().clamp(min=1e-5)) + input_scale_pcs = calc_maxabs_scale(input_range.clamp(min=1e-5), input_range_mid, 
1.0).clamp(min=1e-5) + weight_range_in_ch = torch.max(torch.abs(mod.weight), dim=0)[0].reshape([-1, 1]).clamp(min=1e-5) + weight_range_in_ch_mid = weight_range_in_ch.max() / torch.sqrt( + weight_range_in_ch.max() / weight_range_in_ch.min().clamp(min=1e-5) + ).clamp(min=1e-5) + weight_scale_pcs = calc_maxabs_scale(weight_range_in_ch.clamp(min=1e-5), weight_range_in_ch_mid, 1.0).clamp( + min=1e-5 + ) + + input_scale = ((input_scale_pcs**alpha) / (weight_scale_pcs ** (1 - alpha))).clamp(min=1e-5) + input_scale = scale_to_pow2(input_scale) + input_scale_post = calc_maxabs_scale((input_range / input_scale).max(), fullscale, input_backoff) + input_scale_post = scale_to_pow2(input_scale_post) + + weight_scale_in_ch = torch.ones([mod.weight.shape[1], 1], dtype=hp_dtype, device=device) * (1 / input_scale) + + trans_weight = scale_fcn(mod.weight, weight_scale_in_ch.reshape([1, -1])) + weight_range_out_ch = torch.max(torch.abs(trans_weight), dim=1)[0].reshape([-1, 1]) + + weight_maxabs_scale_out_ch = calc_maxabs_scale(weight_range_out_ch, fullscale, weight_backoff) + weight_maxabs_scale_out_ch = scale_to_pow2(weight_maxabs_scale_out_ch) + output_scale = weight_maxabs_scale_out_ch * input_scale_post + return ModuleConfig( + (input_scale.flatten() * input_scale_post,), + (output_scale.flatten(),), + { + "weight": { + 0: weight_maxabs_scale_out_ch.flatten(), + 1: weight_scale_in_ch.flatten(), + } + }, + ) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/unit_scale.py b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/unit_scale.py new file mode 100644 index 00000000000..6be7673aace --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/scale_methods/unit_scale.py @@ -0,0 +1,52 @@ +import torch + +from ..fp_utils import * +from ..common import * + + +def linear_unit_scale_scales(mod, measurement, params): + device = torch.device("hpu") + hp_dtype = params["hp_dtype"] + input_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + weight_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + output_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + return ModuleConfig((input_scale,), (output_scale,), {"weight": weight_scale}) + + +def fsdpa_unit_scale_scales(mod, measurement, params): + device = torch.device("hpu") + hp_dtype = torch.float32 # params["hp_dtype"] + q_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + k_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + v_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + softmax_scale = torch.tensor(1.0, dtype=hp_dtype, device=device) + input_scale = (q_scale, k_scale, v_scale, softmax_scale) + output_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + return ModuleConfig(input_scale, output_scale, {}) + + +def matmul_unit_scale_scales(mod, measurement, params): + device = torch.device("hpu") + hp_dtype = params["hp_dtype"] + input_scale = ( + torch.tensor(1.0, dtype=hp_dtype, device=device), + torch.tensor(1.0, dtype=hp_dtype, device=device), + ) + output_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + return ModuleConfig(input_scale, output_scale, {}) + + +def softmax_unit_scale_scales(mod, measurement, params): + device = torch.device("hpu") + hp_dtype = params["hp_dtype"] + input_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + output_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + return ModuleConfig(input_scale, output_scale) + + +def kv_cache_unit_scale_scales(mod, measurement, params): + device = 
torch.device("hpu") + hp_dtype = params["hp_dtype"] + input_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + output_scale = (torch.tensor(1.0, dtype=hp_dtype, device=device),) + return ModuleConfig(input_scale, output_scale) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_core/utils.py b/neural_compressor/torch/algorithms/fp8_quant/_core/utils.py new file mode 100644 index 00000000000..a4652bd1755 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_core/utils.py @@ -0,0 +1,49 @@ +from .measure import prepare_model as prepare_model_for_measure +from .quantize import quantize +from .scale import scaling_params, scale_method_mapping +from .._quant_common.quant_config import QuantMode, get_hqt_config + +from .._quant_common.helper_modules import * +from ..utils.logger import logger +from .common import mod_default_dict + +def update_mod_dict(config): + assert len(config.cfg['mod_dict']) == 0, f"Custom modules are not supported: {config.cfg['mod_dict'].keys()}. Please add it in the code." + config.cfg['mod_dict'].update({k: mod_default_dict[k].type for k in mod_default_dict}) + +def print_init_info(config): + import importlib.metadata + versionStr = importlib.metadata.version('habana_quantization_toolkit') + locationStr = versionStr.find('git') + 3 + logger.info("HQT Git revision = %s", versionStr[locationStr:]) + logger.info("HQT Configuration = %s", config) + +def is_substr(substr_list, target): + return any([x in target for x in substr_list]) + +def prepare_model(model): + config = get_hqt_config(model) + update_mod_dict(config) + allowlist=set(config.cfg['mod_dict'].keys()) + blocklist=set() + for type_st in config.cfg['blocklist']['types']: + blocklist.add(type_st) + allowlist.difference_update(blocklist) + allowlist_tuple=tuple(allowlist) + mod_list=[] + for name, mod in model.named_modules(): + mod_type=mod.__class__.__name__ + if (mod_type in allowlist_tuple) and (is_substr(config.cfg['allowlist']['names'], name) or len(config.cfg['allowlist']['names'])==0) and (not is_substr(config.cfg['blocklist']['names'], name)): + mod_list.append(name) + + print_init_info(config) + + logger.debug("Module list: %s", mod_list) + logger.info("Total modules : %d", len(mod_list)) + if (config.cfg['mode']==QuantMode.MEASURE) or (config.cfg['mode']==QuantMode.SHAPE): + return prepare_model_for_measure(model, mod_list) + elif config.cfg['mode']==QuantMode.QUANTIZE: + scaling_method_name = scale_method_mapping[(config.cfg['scale_method'], config.cfg['observer'])] + scaling_params[scaling_method_name].update(config.cfg['scale_params']) + config.cfg['scale_params'] = scaling_params[scaling_method_name] + return quantize(model, mod_list) diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py new file mode 100644 index 00000000000..61d26f081ff --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py @@ -0,0 +1,812 @@ +import torch.nn as nn +import torch + +from .quant_config import QuantMode, get_hqt_config, set_hqt_config + +try: # backwards compatibility for 1.16 + from habana_frameworks.torch.hpex.kernels import fp8_fused_sdpa +except ImportError: + pass + + +class BMM(nn.Module): + def __init__(self): + 
super().__init__() + + def forward(self, x, y): + return torch.bmm(x, y) + + +class Matmul(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, *args, **kwargs): + return torch.matmul(*args, **kwargs) + + +class Identity(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return x.clone() + + +class Softmax(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, dim=None): + return torch.softmax(x, dim) + + +def matmul_fp8( + input, + other, + out=None, + out_dtype=torch.bfloat16, + scale_input_inv=None, + scale_other_inv=None, +): + res = torch.ops.hpu.fp8_gemm_v2( + input, + False, + other, + False, + out, + out_dtype, + scale_input_inv, + scale_other_inv, + None, + False, + ) + return res + + +def measure_input(input, observer): + for i in range(len(observer)): + observer[i].measure(input[i]) + + +def measure_output(output, observer): + if observer: + for i in range(len(observer)): + observer[i].measure(output[i]) + + +def conv2d_fp8( + input, + other, + bias, + stride, + padding, + dilation, + groups, + out_dtype=torch.bfloat16, + scale_input_inv=None, + scale_other_inv=None, +): + return torch.ops.hpu.conv2d_fp8( + input=input, + weight=other, + bias=bias, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + out_dtype=out_dtype, + scale_input=scale_input_inv, + scale_weight=scale_other_inv, + ) + + +def set_attrs_from_orig_model(cls_instance, mod, mod_extra_config, *func_names): + cls_instance.__dict__.update(mod.__dict__) + config = get_hqt_config(cls_instance) + cls_instance.extra_repr_org = mod.extra_repr + cls_instance.class_name_org = mod.__class__.__name__ + cls_instance._mod_extra_config = mod_extra_config + cls_instance.quantization_mode = config.cfg["mode"] + cls_instance.forward_orig = mod.forward + if func_names is not None: + for func in func_names: + setattr(cls_instance, func, getattr(mod, func)) + + +def get_current_repr(cls_instance, *member_names): + curr_repr = "" + if cls_instance.quantization_mode == QuantMode.QUANTIZE: + first_name = True + for name in member_names: + if not first_name: + curr_repr += ", " + curr_repr += f"{name} dtype={getattr(cls_instance, name).dtype}" + first_name = False + return curr_repr + + +def extra_representation(org_repr, org_name, curr_repr): + repr = f"original={org_name}," + (" " + org_repr + "," if org_repr != "" else "") + return f"{repr} {curr_repr}" + + +def _raise_lora_layer_error(layer_class): + raise RuntimeError( + f"{layer_class} quantization is not supported in case of lora_layer member is not None." 
+ f" Can add {layer_class} to 'blocklist' field in quantization config file" + ) + + +class PatchedMatmul(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input_0 = self._mod_extra_config.inputs[0] + self.quant_input_1 = self._mod_extra_config.inputs[1] + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + self.scale_other = nn.Parameter(mod_extra_config.scale.inputs[1]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input, other): + qinput = self.quant_input_0(input) + qother = self.quant_input_1(other) + output = matmul_fp8( + qinput, + qother, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_other, + ) + return output + + def forward_measure(self, input, other): + measure_input((input, other), observer=self._mod_extra_config.inputs) + output = self.forward_orig(input, other) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_other"), + ) + + +class PatchedLinear(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + # When offloading weights to disk using device_map, the module forward is overridden. + # __dict__.update call again overrides the PatchedLinear forward with the forward that device_map planted. + # So need to set PatchedLinear forawrd to be the right forward. 
+ self.forward = self.forward_quant + self.quant_input = self._mod_extra_config.inputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)): + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif isinstance(mod_extra_config.scale.params["weight"], dict): + # PCQ weight is calculated with actual weight [0] and ones [1] + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward_quant(self, input): + qinput = self.quant_input(input) + y = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + output = y + self.bias if (self.bias is not None) else y + return output + + def forward_measure(self, input): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = self.forward_orig(input) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedLinearAllReduce(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + self.scoped_version = mod.__class__.__name__ == "ScopedLinearAllReduce" + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)): + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif isinstance(mod_extra_config.scale.params["weight"], dict): + # PCQ weight is calculated with actual weight [0] and ones [1] + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input): + # pre_all_reduce + qinput = self.quant_input(input) + output = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + dqoutput = self.quant_output(output) + if not self.scoped_version: + self.all_reduce(dqoutput) + dqoutput = self.post_all_reduce(dqoutput) + return dqoutput + + def forward_measure(self, input): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = torch.matmul(input, self.weight.transpose(-1, -2)) + measure_output((output,), self._mod_extra_config.outputs) + # in scoped version all reduce is being called outside of the layer + if not self.scoped_version: + self.all_reduce(output) + output = self.post_all_reduce(output) + return output + + def all_reduce(self, input): + if self.mp_group is not None: + from deepspeed import comm as dist + + dist.inference_all_reduce(input, group=self.mp_group) + + def post_all_reduce(self, input): + output = input + 
self.bias if (self.bias is not None) else input + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedRowParallelLinear(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config, "resolve_input") + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)): + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif isinstance(mod_extra_config.scale.params["weight"], dict): + # PCQ weight is calculated with actual weight [0] and ones [1] + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input): + resolved_input = self.resolve_input(input) + qinput = self.quant_input(resolved_input) + output = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + dqoutput = self.quant_output(output) + if self.reduce_results: + dqoutput = self.collective_func(dqoutput) + return self.post_all_reduce(dqoutput) + + def forward_measure(self, input): + resolved_input = self.resolve_input(input) + measure_input((resolved_input,), observer=self._mod_extra_config.inputs) + output = torch.matmul(resolved_input, self.weight.transpose(-1, -2)) + measure_output((output,), self._mod_extra_config.outputs) + if self.reduce_results: + output = self.collective_func(output) + return self.post_all_reduce(output) + + def post_all_reduce(self, output): + assert ( + self.reduce_results or (not self.bias) or self.skip_bias_add + ), "When not reduce the results, adding bias to the results can lead to incorrect results" + if not self.skip_bias_add: + output = output + self.bias if self.bias is not None else output + output_bias = None + else: + output_bias = self.bias + return output, output_bias + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedColumnParallelLinear(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)): + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif isinstance(mod_extra_config.scale.params["weight"], dict): + # PCQ weight is calculated with actual weight [0] and ones [1] + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0]) + elif (self.quantization_mode == QuantMode.MEASURE) or 
(self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input): + qinput = self.quant_input(input) + output = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + dqoutput = self.quant_output(output) + if self.gather_output: + dqoutput = self.collective_func(dqoutput) + return self.post_all_reduce(dqoutput) + + def forward_measure(self, input): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = torch.matmul(input, self.weight.transpose(-1, -2)) + measure_output((output,), self._mod_extra_config.outputs) + if self.gather_output: + output = self.collective_func(output) + return self.post_all_reduce(output) + + def post_all_reduce(self, output): + if not self.skip_bias_add: + output = output + self.bias if self.bias is not None else output + output_bias = None + else: + output_bias = self.bias + return output, output_bias + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedLmHeadLinearAllreduce(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + if isinstance(mod_extra_config.scale.params["weight"], (torch.Tensor, float)): + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif isinstance(mod_extra_config.scale.params["weight"], dict): + # PCQ weight is calculated with actual weight [0] and ones [1] + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"][0]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input): + assert ( + input.shape[-1] % self.world_size == 0 + ), "Please ensure that self.world_size is divisible by input.shape[-1]" + input_shard = input.shape[-1] // self.world_size + splittedInput = input[:, :, self.rank * input_shard : (self.rank + 1) * input_shard] + qinput = self.quant_input(splittedInput) + output = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + dqoutput = self.quant_output(output) + + if self.mp_group is not None: + from deepspeed import comm as dist + + dist.inference_all_reduce(dqoutput, group=self.mp_group) + if self.bias is not None: + dqoutput += self.bias + return dqoutput + + def forward_measure(self, input): + assert ( + input.shape[-1] % self.world_size == 0 + ), "Please ensure that self.world_size is divisible by input.shape[-1]" + input_shard = input.shape[-1] // self.world_size + splittedInput = input[:, :, self.rank * input_shard : (self.rank + 1) * input_shard] + measure_input((splittedInput,), observer=self._mod_extra_config.inputs) + output = torch.matmul(splittedInput, self.weight.t()) + measure_output((output,), self._mod_extra_config.outputs) + + if self.mp_group is not None: + from deepspeed import comm as dist + + 
dist.inference_all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedKVCache(nn.Module): + # Module to patch KVCache module from llama model + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config, "forward", "get_shape") + self.org_allocate = mod.allocate + self.org_update = mod.update + if self.quantization_mode == QuantMode.QUANTIZE: + mod.update = self.update + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.update = self.update_measure + mod.update = self.update_measure + + # overwrite allocate function of original module to force allocation in fp8 + def allocate(self, inp_seq_len, dtype, device, shape): + dtype = torch.float8_e4m3fn if (self.quantization_mode == QuantMode.QUANTIZE) else dtype + return self.org_allocate(inp_seq_len, dtype, device, shape) + + # overwrite update function of original module to force quant and dequant of cache input and output + def update(self, prev, cur, dim, idx, inp_seq_len): + qinput = self.quant_input(cur) + output = self.org_update(prev, qinput, dim, idx, inp_seq_len) + if output.dtype == torch.float8_e4m3fn: + return self.quant_output(output) + else: + return output + + # overwrite update function of original module to force quant and dequant of cache input and output + def update_measure(self, prev, cur, dim, idx, inp_seq_len): + measure_input((cur,), self._mod_extra_config.inputs) + output = self.org_update(prev, cur, dim, idx, inp_seq_len) + measure_output((output,), self._mod_extra_config.outputs) + return output + + +class PatchedVLLMKVCache(nn.Module): + # Module to patch VLLMKVCache module from llama model + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.quant_output = self._mod_extra_config.outputs[0] + self.orig_fetch_from_cache = mod.fetch_from_cache + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.fetch_from_cache = mod.fetch_from_cache + self.forward = self.forward_measure + + def forward(self, input, cache, block_indices, block_offset): + qinput = self.quant_input(input) + output_cache = self.forward_orig(qinput, cache, block_indices, block_offset) + return self.quant_output(output_cache) + + def forward_measure(self, input, cache, block_indices, block_offset): + measure_input((input), self._mod_extra_config.inputs) + output_cache = self.forward_orig(input, cache, block_indices, block_offset) + measure_output((output_cache), self._mod_extra_config.outputs) + return output_cache + + def fetch_from_cache(self, cache, blocks, permutations): + quant_cache = self.quant_input(cache) + output_cache = self.orig_fetch_from_cache(quant_cache, blocks, permutations) + for i in range(len(output_cache)): + output_cache[i]=self.quant_output(output_cache[i]) + return output_cache + + +class PatchedConv2d(nn.Conv2d): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + 
set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input): + qinput = self.quant_input(input) + output = conv2d_fp8( + qinput, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + return output + + def forward_measure(self, input): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = self.forward_orig(input) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedSoftmax(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_output = self._mod_extra_config.outputs[0] + # input scale is 1 assuming the input to SM is descaled because we are using HW supported scales + self.scale_input = nn.Parameter(torch.Tensor([1.0])) + self.scale_output = nn.Parameter(torch.Tensor([1 / mod_extra_config.scale.outputs[0]])) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, x, dim=None, invAttnHead=None): + output = torch.ops.hpu.softmax_fp8(x, dim, self.scale_input, self.scale_output, invAttnHead) + return self.quant_output(output) + + def forward_measure(self, x, dim=None, invAttnHead=None): + measure_input((x,), observer=self._mod_extra_config.inputs) + output = self.forward_orig(x, dim, invAttnHead) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_output"), + ) + + +class PatchedLoRACompatibleLinear(nn.Linear): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.weight = nn.Parameter(self.weight.t().contiguous()) + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input, scale: float = 1.0): + qinput = self.quant_input(input) + y = matmul_fp8( + qinput, + self.weight, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + output = y + self.bias if (self.bias is not None) else y + if self.lora_layer is not None: + # TODO SW-174899 support lora layer quantization + _raise_lora_layer_error(self.class_name_org) + # output = output + (scale 
* self.lora_layer(input)) + return output + + def forward_measure(self, input, scale: float = 1.0): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = self.forward_orig(input, scale) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedLoRACompatibleConv(nn.Conv2d): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_input = self._mod_extra_config.inputs[0] + self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0]) + self.scale_weight = nn.Parameter(mod_extra_config.scale.params["weight"]) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = self.forward_measure + + def forward(self, input, scale: float = 1.0): + qinput = self.quant_input(input) + if self.lora_layer is not None: + # TODO SW-174899 support lora layer quantization + _raise_lora_layer_error(self.class_name_org) + # output = conv2d_fp8(qinput, self.weight, None, self.stride, self.padding, self.dilation, self.groups, \ + # out_dtype=self._mod_extra_config.config_params["hp_dtype"], scale_input_inv=self.scale_input, scale_other_inv=self.scale_weight) + # output = output + (scale * self.lora_layer(input)) + # output = output+torch.unsqueeze(torch.unsqueeze(self.bias,1), 1) if (self.bias is not None) else output + else: + output = conv2d_fp8( + qinput, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + out_dtype=self._mod_extra_config.config_params["hp_dtype"], + scale_input_inv=self.scale_input, + scale_other_inv=self.scale_weight, + ) + return output + + def forward_measure(self, input, scale: float = 1.0): + measure_input((input,), observer=self._mod_extra_config.inputs) + output = self.forward_orig(input, scale) + measure_output((output,), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr(self, "scale_input", "scale_weight"), + ) + + +class PatchedModuleFusedSDPA(nn.Module): + def __init__(self, mod, mod_extra_config, *args, **kwargs): + # fsdpa is combined out of - BMM1(Q,K) -> Softmax -> BMM2(AMAX,V) + # during measure we recieve the amax value from the cguid and apply it during quant as input + super().__init__() + set_attrs_from_orig_model(self, mod, mod_extra_config) + if self.quantization_mode == QuantMode.QUANTIZE: + self.quant_q = self._mod_extra_config.inputs[0] + self.quant_k = self._mod_extra_config.inputs[1] + self.quant_v = self._mod_extra_config.inputs[2] + self.dequant_output = self._mod_extra_config.outputs[0] + self.scale_q = nn.Parameter(mod_extra_config.scale.inputs[0].type(torch.float32)) + self.scale_k = nn.Parameter(mod_extra_config.scale.inputs[1].type(torch.float32)) + self.scale_v = nn.Parameter(mod_extra_config.scale.inputs[2].type(torch.float32)) + self.descale_amax = nn.Parameter(mod_extra_config.scale.inputs[3].type(torch.float32)) + self.scale_output = nn.Parameter(1 / mod_extra_config.scale.outputs[0].type(torch.float32)) + self.scale_amax = nn.Parameter(1 / self.descale_amax) + elif (self.quantization_mode == QuantMode.MEASURE) or (self.quantization_mode == QuantMode.SHAPE): + self.forward = 
self.forward_measure + + def forward( + self, + q, + k, + v, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=None, + softmax_mode="None", + ): + qinput = self.quant_q(q).detach() + kinput = self.quant_k(k).detach() + vinput = self.quant_v(v).detach() + results = fp8_fused_sdpa( + qinput, + kinput, + vinput, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + scale=scale, + # fp8_fused_sdpa in fp8 mode supports only FastSoftmax + softmax_mode="None", + d_scale_q=self.scale_q, + d_scale_k=self.scale_k, + d_scale_v=self.scale_v, + q_scale_s=self.scale_amax, + q_scale_o=self.scale_output, + d_scale_s=self.descale_amax, + is_amax_s=False, + ) + output = results[0] + d_out = self.dequant_output(output) + return d_out + + def forward_measure( + self, + q, + k, + v, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=None, + softmax_mode="fast", + ): + dq = q.detach() + dk = k.detach() + dv = v.detach() + measure_input((dq, dk, dv), observer=self._mod_extra_config.inputs) + results = fp8_fused_sdpa( + dq, + dk, + dv, + attn_mask=attn_mask, + dropout_p=dropout_p, + is_causal=is_causal, + scale=scale, + # fp8_fused_sdpa in bf16 can use either FastSoftmax or regular + softmax_mode="fast", + is_amax_s=True, + ) + output = results[0] + amax = results[1] + measure_output((output, amax), self._mod_extra_config.outputs) + return output + + def extra_repr(self) -> str: + return extra_representation( + self.extra_repr_org(), + self.class_name_org, + get_current_repr( + self, + "scale_q", + "scale_k", + "scale_v", + "descale_amax", + "scale_amax", + "scale_output", + ), + ) + + +class PatchedUnmeasuredModule(nn.Module): + def __init__(self, name, *args, **kwargs): + super().__init__() + self.name = name + + def forward(self, *args, **kwargs): + raise Exception( + "Error - Layer '{}' was called but was not quantized because no measures were supplied.".format(self.name) + ) + + def extra_repr(self) -> str: + return f"Dummy patch of {self.name} to raise excption as there are no measurements provided." 
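All of the Patched* wrappers above share the same skeleton: set_attrs_from_orig_model copies the original module's state, QUANTIZE mode installs per-tensor input quantizers and registers fp8 scales as nn.Parameters, and MEASURE/SHAPE mode reroutes forward to a measuring variant that runs the original forward between observer calls. A minimal sketch of that skeleton for a hypothetical single-input op (PatchedMyOp and my_fp8_kernel are illustrative placeholders, not part of this patch):

    class PatchedMyOp(nn.Module):
        def __init__(self, mod, mod_extra_config, *args, **kwargs):
            super().__init__()
            set_attrs_from_orig_model(self, mod, mod_extra_config)
            if self.quantization_mode == QuantMode.QUANTIZE:
                # fp8 path: input quantizer plus its scale registered as a parameter
                self.quant_input = self._mod_extra_config.inputs[0]
                self.scale_input = nn.Parameter(mod_extra_config.scale.inputs[0])
            elif self.quantization_mode in (QuantMode.MEASURE, QuantMode.SHAPE):
                # calibration path: observe ranges around the original forward
                self.forward = self.forward_measure

        def forward(self, x):
            qx = self.quant_input(x)
            # my_fp8_kernel stands in for one of the fp8 ops above (matmul_fp8, conv2d_fp8, ...)
            return my_fp8_kernel(qx, scale_input_inv=self.scale_input,
                                 out_dtype=self._mod_extra_config.config_params["hp_dtype"])

        def forward_measure(self, x):
            measure_input((x,), observer=self._mod_extra_config.inputs)
            out = self.forward_orig(x)
            measure_output((out,), self._mod_extra_config.outputs)
            return out

The real wrappers differ mainly in how many inputs they quantize and which fp8 kernel they dispatch to (matmul_fp8, conv2d_fp8, softmax_fp8, fp8_fused_sdpa).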
diff --git a/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py new file mode 100644 index 00000000000..10c94dea640 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py @@ -0,0 +1,250 @@ +from __future__ import annotations + +import json +import os +import torch +from enum import Enum, Flag, auto +from dataclasses import dataclass +from json.decoder import JSONDecodeError +from typing import Any, Mapping +import habana_frameworks.torch.utils.experimental as htexp + +from ..utils.logger import logger + +local_rank = int(os.getenv("LOCAL_RANK", "-1")) +world_size = int(os.getenv("WORLD_SIZE", "-1")) +global_rank = int(os.getenv("RANK", "-1")) + + +class QuantMode(Enum): + NONE = 0 + QUANTIZE = 1 + MEASURE = 2 + SHAPE = 3 + + +class MeasureExclude(Flag): + NONE = auto() + INPUT = auto() + OUTPUT = auto() + PARAMS = auto() + ALL = auto() + + +class ScaleMethod(Enum): + MAX = 1 + UNIT_SCALE = 2 + MAXABS_HW = 3 + MAXABS_POW2 = 4 + SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 = 5 + WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 = 6 + ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2 = 7 + ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2 = 8 + ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2 = 9 + ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2 = 10 + SMOOTHQUANT_OPT = 11 + MAXABS_HW_OPT_WEIGHT = 12 + MAXABS_POW2_OPT_WEIGHT = 13 + + +def get_hqt_config(mod) -> Fp8cfg: + return mod.__hqt_config__ + + +def set_hqt_config(mod, config): + mod.__hqt_config__ = config + + +@dataclass +class Fp8cfg: + cfg: Mapping[str, Any] + + def parse(custom_config: Mapping[str, str]) -> Fp8cfg: + measured_global_config = { + "dump_stats_path": "stats", + "fp8_config": torch.float8_e4m3fn, # The parameters of the chosen Quantization methed + "hp_dtype": torch.bfloat16, # The parameters of the chosen Quantization methed + "blocklist": { + "names": [], + "types": (), + }, # types and names to not be quantized + "allowlist": { + "names": [], + "types": ("torch.nn.Linear", "torch.nn.Conv2d", "BMM"), + }, # types and names to be quantized. Allowlist by names is not yet implemented + "mode": QuantMode.QUANTIZE, # Quantize or Measure + "scale_method": ScaleMethod.UNIT_SCALE, # Method to quantize with + "scale_params": {}, # scaling parameters that are different then the default ones + "observer": "maxabs", # Supported ['shape', 'maxabs', 'maxabs_per_channel', 'save'] + "mod_dict": {}, + "ignore_modules_wo_measures": False, # Determines whether to fail quantization on modules without existing measures or not to quantize them + "local_rank": local_rank if local_rank >= 0 else None, + "global_rank": None, + "world_size": world_size if world_size >= 0 else None, + "seperate_measure_files": True, # Determines whether to expect one or several measure files when using more than one gaudi + "device_type": htexp._get_device_type(), # Determines device type: Gaudi2, Gaudi3... 
+ "measure_exclude": MeasureExclude.OUTPUT, + } + # assert measured_global_config['allowlist']['names'] == [''], "Allowlist names not yet implemented" + + # go over all user-defined keys from json, handle various cases + for keys in custom_config: + if keys == "mode": + if custom_config[keys] == "NONE": + custom_config[keys] = QuantMode.NONE + elif custom_config[keys] == "QUANTIZE": + custom_config[keys] = QuantMode.QUANTIZE + elif custom_config[keys] == "MEASURE": + custom_config[keys] = QuantMode.MEASURE + elif custom_config[keys] == "SHAPE": + custom_config[keys] = QuantMode.SHAPE + else: + raise ValueError("invalid mode in custom config. Enter Quantize or Measure") + + if keys == "measure_exclude": + if custom_config[keys] == "NONE": + custom_config[keys] = MeasureExclude.NONE + elif custom_config[keys] == "OUTPUT": + custom_config[keys] = MeasureExclude.OUTPUT + elif custom_config[keys] == "INPUT": + custom_config[keys] = MeasureExclude.INPUT + elif custom_config[keys] == "ALL": + custom_config[keys] = MeasureExclude.ALL + else: + raise ValueError("invalid measure exclude value in custom config. Enter OUTPUT or NONE") + + if keys == "fp8_config": + if custom_config[keys].lower() == "e4m3": + custom_config[keys] = torch.float8_e4m3fn + + elif custom_config[keys].lower() == "e5m2": + custom_config[keys] = torch.float8_e5m2 + else: + raise ValueError("invalid fp8_config in custom config. Enter E4M3 or E5M2") + + if keys == "scale_method": + if custom_config[keys].lower() == "unit_scale": + custom_config[keys] = ScaleMethod.UNIT_SCALE + elif custom_config[keys].lower() == "max": + custom_config[keys] = ScaleMethod.MAX + elif custom_config[keys].lower() == "maxabs_hw": + custom_config[keys] = ScaleMethod.MAXABS_HW + elif custom_config[keys].lower() == "maxabs_pow2": + custom_config[keys] = ScaleMethod.MAXABS_POW2 + elif custom_config[keys].lower() == "maxabs_hw_opt_weight": + custom_config[keys] = ScaleMethod.MAXABS_HW_OPT_WEIGHT + elif custom_config[keys].lower() == "maxabs_pow2_opt_weight": + custom_config[keys] = ScaleMethod.MAXABS_POW2_OPT_WEIGHT + elif custom_config[keys].lower() == "smoothquant_weights_output_channel_maxabs_pow2": + custom_config[keys] = ScaleMethod.SMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 + elif custom_config[keys].lower() == "weaksmoothquant_weights_output_channel_maxabs_pow2": + custom_config[keys] = ScaleMethod.WEAKSMOOTHQUANT_WEIGHTS_OUTPUT_CHANNEL_MAXABS_POW2 + elif custom_config[keys].lower() == "act_maxabs_hw_weights_pcs_maxabs_pow2": + custom_config[keys] = ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2 + elif custom_config[keys].lower() == "act_maxabs_hw_weights_pcs_opt_pow2": + custom_config[keys] = ScaleMethod.ACT_MAXABS_HW_WEIGHTS_PCS_OPT_POW2 + elif custom_config[keys].lower() == "act_maxabs_pow2_weights_pcs_maxabs_pow2": + custom_config[keys] = ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_MAXABS_POW2 + elif custom_config[keys].lower() == "act_maxabs_pow2_weights_pcs_opt_pow2": + custom_config[keys] = ScaleMethod.ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2 + elif custom_config[keys].lower() == "smoothquant_opt": + custom_config[keys] = ScaleMethod.SMOOTHQUANT_OPT + else: + raise ValueError( + f'Invalid fp8_config in custom config ({custom_config[keys]}). should be in ["max", "unit_scale", "maxabs_hw", "maxabs_pow2", "maxabs_per_channel_pow2", "smoothquant_opt"]' + ) + + if keys == "ignore_modules_wo_measures": + custom_config[keys] = custom_config[keys].lower() == "true" + + # TODO [SW-175936] - remove checking for old key names whitelist and blacklist. 
+ if isinstance(custom_config[keys], dict): + for keys_2 in custom_config[keys]: + if keys == "whitelist": + measured_global_config["allowlist"][keys_2] = custom_config[keys][keys_2] + elif keys == "blacklist": + measured_global_config["blocklist"][keys_2] = custom_config[keys][keys_2] + else: + measured_global_config[keys][keys_2] = custom_config[keys][keys_2] + else: + if keys == "whitelist": + measured_global_config["allowlist"] = custom_config[keys] + elif keys == "blacklist": + measured_global_config["blocklist"] = custom_config[keys] + else: + measured_global_config[keys] = custom_config[keys] + + # If seperate_measure_files is True (default value), then it is assumed that there are multiple distinct measure and scale files + # and they are stored in / loaded from paths with the correct index as a suffix. Else, only one is searched for. + measured_global_config["local_rank"] = ( + local_rank if local_rank >= 0 and (custom_config.get("seperate_measure_files", True) == True) else None + ) + + base_name = measured_global_config["dump_stats_path"].split("/")[-1] + folder_name = measured_global_config["dump_stats_path"][: -(len(base_name))] + measured_global_config["dump_stats_base_path"] = folder_name + os.makedirs(folder_name, exist_ok=True) + worker_st = ( + "" + if measured_global_config["local_rank"] == None + else "_" + str(measured_global_config["local_rank"]) + "_" + str(measured_global_config["world_size"]) + ) + measured_global_config["shape_file"] = measured_global_config["dump_stats_path"] + "_hooks_shape" + worker_st + measured_global_config["scale_file"] = ( + measured_global_config["dump_stats_path"] + + "_hooks_" + + measured_global_config["observer"] + + "_" + + measured_global_config["scale_method"].name + + worker_st + ) + if (measured_global_config["mode"] == QuantMode.MEASURE) or ( + measured_global_config["mode"] == QuantMode.QUANTIZE + ): + measured_global_config["measure_file"] = ( + measured_global_config["dump_stats_path"] + "_hooks_" + measured_global_config["observer"] + worker_st + ) + # measured_global_config['dump_stats_path'] += '_hooks_.json' + + logger.debug("HQT Paths:") + logger.debug("base_name='%s'", base_name) + logger.debug("folder_name='%s'", folder_name) + logger.debug( + "measured_global_config['shape_file']='%s'", + measured_global_config["shape_file"], + ) + logger.debug( + "measured_global_config['scale_file']='%s'", + measured_global_config["scale_file"], + ) + if "measure_file" in measured_global_config.keys(): + logger.debug( + "measured_global_config['measure_file']='%s'", + measured_global_config["measure_file"], + ) + logger.debug( + "measured_global_config['dump_stats_path']='%s'", + measured_global_config["dump_stats_path"], + ) + + return Fp8cfg(cfg=measured_global_config) + + +def _read_config_from_file(config_path: str) -> Mapping[str, str]: + logger.debug("QUANT PACKAGE: using %s config", config_path) + + module_directory = os.path.dirname(os.path.abspath(__file__)) + + # if file in absolute path doesn't exist, try looking in cfg directory + if not os.path.isfile(config_path): + config_path = os.path.join(module_directory, "..", f"custom_config/{config_path}.json") + try: + logger.info("QUANT PACKAGE: Loading %s", config_path) + with open(config_path) as config_json: + config = json.load(config_json) + except FileNotFoundError as e: + raise Exception(f"Got exception: {e}. QUANT PACKAGE: Can't open {config_path}!") + except JSONDecodeError as e: + config_json.close() + raise Exception(f"Got exception: {e}. 
QUANT PACKAGE: Can't load {config_path} json!") + return config diff --git a/neural_compressor/torch/algorithms/fp8_quant/common.py b/neural_compressor/torch/algorithms/fp8_quant/common.py index 4a603c677ac..ff1dc90a43f 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/common.py +++ b/neural_compressor/torch/algorithms/fp8_quant/common.py @@ -21,13 +21,15 @@ import torch +from neural_compressor.torch.algorithms.fp8_quant.prepare_quant.prepare_model import finish_measurements +from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import Fp8cfg + def save_calib_result(model): - import habana_quantization_toolkit as hqt if (hasattr(model, "__hqt_config__") and - isinstance(model.__hqt_config__, hqt._quant_common.quant_config.Fp8cfg)): + isinstance(model.__hqt_config__, Fp8cfg)): # TODO SW-184714 modify hqt notation to inc notation once code is ported - hqt.finish_measurements(model) + finish_measurements(model) else: raise NotImplementedError("Saving calibration results currently supported only in HPU.") diff --git a/neural_compressor/torch/algorithms/fp8_quant/custom_config/custom_example.json b/neural_compressor/torch/algorithms/fp8_quant/custom_config/custom_example.json new file mode 100644 index 00000000000..26b8af220a7 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/custom_config/custom_example.json @@ -0,0 +1,5 @@ +{ + "mode": "MEASURE", + "scale_method": "MAX", + "fp8_config": "E4M3" +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_measure.json b/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_measure.json new file mode 100644 index 00000000000..fc675067c22 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_measure.json @@ -0,0 +1,14 @@ +{ + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [] + }, + "quantize_weight": false, + "dump_stats_path": "./llama_output/7b_measure" +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_quant.json b/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_quant.json new file mode 100644 index 00000000000..f341964187a --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/custom_config/llama_quant.json @@ -0,0 +1,17 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "quantize_weight": false, + "dump_stats_path": "./llama_output/7b_measure" +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/custom_config/measure_config.json b/neural_compressor/torch/algorithms/fp8_quant/custom_config/measure_config.json new file mode 100755 index 00000000000..b8c4d29b781 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/custom_config/measure_config.json @@ -0,0 +1,12 @@ +{ + "mode": "MEASURE", + "scale_method": "MAX", + "quantize_weight": true, + "dump_stats_path": "./run_outputs/fp8/stats", + "allowlist": { + "types": [ + "torch.nn.Linear", + "torch.nn.Conv2d" + ] + } +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/custom_config/quant_config.json b/neural_compressor/torch/algorithms/fp8_quant/custom_config/quant_config.json new file mode 100755 index 00000000000..286a1632257 --- /dev/null +++ 
b/neural_compressor/torch/algorithms/fp8_quant/custom_config/quant_config.json @@ -0,0 +1,13 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "fp8_config": "E4M3", + "allowlist": { + "types": [ + "torch.nn.Linear", + "torch.nn.Conv2d" + ] + }, + "dump_stats_path": "./run_outputs/fp8/stats" +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py index f9ce9145569..bbde53fb417 100644 --- a/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py +++ b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py @@ -20,6 +20,7 @@ restore_patched_module, update_mode, with_patched_module, + prep_model, ) @@ -44,12 +45,10 @@ def convert(self, model): def _convert(model, config_path): - import habana_quantization_toolkit as hqt - # update mode to QUANTIZE config_path = update_mode(config_path, quant_step=True) - return hqt.prep_model(model, config_path) + return prep_model(model, config_path) def _prepare(model, config_path): @@ -58,4 +57,4 @@ def _prepare(model, config_path): # update mode to MEASURE config_path = update_mode(config_path, measure_step=True) - return hqt.prep_model(model, config_path) + return prep_model(model, config_path) diff --git a/neural_compressor/torch/algorithms/fp8_quant/prepare_quant/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/prepare_quant/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/prepare_quant/prepare_model.py b/neural_compressor/torch/algorithms/fp8_quant/prepare_quant/prepare_model.py new file mode 100644 index 00000000000..8a38f79388b --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/prepare_quant/prepare_model.py @@ -0,0 +1,36 @@ +import os +from typing import Optional +from .._quant_common.quant_config import Fp8cfg +from .._core.measure import save_measurements +from .._core.utils import prepare_model +from .._quant_common.quant_config import ( + _read_config_from_file, + Fp8cfg, + set_hqt_config, +) + +def _prep_model_with_predefined_config(model, *, config: Fp8cfg): + set_hqt_config(model, config) + prepare_model(model) + + +def prep_model(model, config_path: Optional[str] = None): + """ + Prepare this model with the given (absolute or relative) path of the json file containing the configuration. + If `config_path` is not given or `None`, + instead perform the legacy behavior of checking for env variable `QUANT_CONFIG`. 
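+
+    A minimal usage sketch (the json file name is illustrative):
+
+        prep_model(model, config_path="maxabs_measure.json")  # patch modules per the config
+        # ... run calibration / inference ...
+        finish_measurements(model)  # dump the collected measurements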
+ """ + if config_path is None: + config_path = os.getenv("QUANT_CONFIG") + if config_path is None: + raise EnvironmentError( + "Either pass config_path parameter explicitly (recommended), or set environment variable QUANT_CONFIG" + ) + + config = _read_config_from_file(config_path=config_path) + config = Fp8cfg.parse(config) + return _prep_model_with_predefined_config(model, config=config) + + +def finish_measurements(model): + save_measurements(model) diff --git a/neural_compressor/torch/algorithms/fp8_quant/scripts/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/scripts/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/golden_metrics.json b/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/golden_metrics.json new file mode 100644 index 00000000000..8409f7ffb47 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/golden_metrics.json @@ -0,0 +1,74 @@ +{ + "bf16": { + "winogrande": { + "mean": 0.7995, + "sem": 0.0112 + }, + "hellaswag": { + "mean": 0.6529, + "sem": 0.0048 + }, + "piqa": { + "mean": 0.8166, + "sem": 0.0090 + }, + "lambada_openai": { + "mean": 0.7900, + "sem": 0.0057 + } + }, + "fp8": { + "ptq": { + "winogrande": { + "mean": 0.7948, + "sem": 0.0113, + "mean_diff": -0.0047, + "sem_diff": 0.0058 + }, + "hellaswag": { + "mean": 0.6473, + "sem": 0.0048, + "mean_diff": 0.0056, + "sem_diff": 0.0014 + }, + "piqa": { + "mean": 0.8134, + "sem": 0.0091, + "mean_diff": -0.0033, + "sem_diff": 0.0034 + }, + "lambada_openai": { + "mean": 0.7900, + "sem": 0.0057, + "mean_diff": 0.0000, + "sem_diff": 0.0021 + } + }, + "pcq": { + "winogrande": { + "mean": 0.8003, + "sem": 0.0112, + "mean_diff": 0.0008, + "sem_diff": 0.0060 + }, + "hellaswag": { + "mean": 0.6512, + "sem": 0.0048, + "mean_diff": -0.0017, + "sem_diff": 0.0010 + }, + "piqa": { + "mean": 0.8150, + "sem": 0.0091, + "mean_diff": -0.0016, + "sem_diff": 0.0031 + }, + "lambada_openai": { + "mean": 0.7920, + "sem": 0.0057, + "mean_diff": 0.0019, + "sem_diff": 0.0021 + } + } + } +} \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/regression_detection.py b/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/regression_detection.py new file mode 100644 index 00000000000..59d609a48dd --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/scripts/regression_detection/regression_detection.py @@ -0,0 +1,117 @@ +import argparse +import numpy as np +import scipy +import json + +tasks = ["winogrande", "hellaswag", "piqa", "lambada_openai"] + + +def ztest(ref_mean=0.0, ref_stderr=1.0, test_mean=0.0, test_stderr=0.0): + z_score = (test_mean - ref_mean) / np.sqrt(ref_stderr**2 + test_stderr**2) + p_value = 1.0 + scipy.special.erf(-np.abs(z_score) / np.sqrt(2)) + return p_value + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Regression detection using Z-Test. 
We assume we have mean and SEM of golden run arranged in json and test results json and we compare the results to see if degregation occurred.", + ) + parser.add_argument( + "--hp_dtype", + type=str, + help="Data type of the high precision test", + default=None, + ) + parser.add_argument( + "--lp_dtype", + type=str, + help="Data type of the low precision test", + default=None, + ) + parser.add_argument( + "--golden_metrics", + type=str, + help="Path to json that includes mean, SEM and diff golden metrics of bf16 and fp8 precision.", + default=None, + ) + parser.add_argument( + "--test_metrics_lp", + type=str, + help="Path to json that includes mean, SEM and diff test metrics of lp precision.", + default=None, + ) + parser.add_argument( + "--test_metrics_hp", + type=str, + help="Path to json that includes mean, SEM and diff test metrics of high precision.", + default=None, + ) + parser.add_argument("--quantization_mode", type=str, help="quantization mode", default=None) + args = parser.parse_args() + mode = args.quantization_mode + hp_dtype = args.hp_dtype + lp_dtype = args.lp_dtype + golden_metrics_path = args.golden_metrics + test_metrics_lp_path = args.test_metrics_lp + test_metrics_hp_path = args.test_metrics_hp + if golden_metrics_path is None or test_metrics_hp_path is None or test_metrics_lp_path is None: + print("Please provide golden_metrics, test_metrics_hp_path and test_metrics_lp_path json paths") + exit(1) + + with open(golden_metrics_path, "r") as f: + golden_metrics_json = json.load(f) + + with open(test_metrics_lp_path, "r") as f: + test_metrics_lp_json = json.load(f) + test_metrics_lp_json = test_metrics_lp_json["results"] + + with open(test_metrics_hp_path, "r") as f: + test_metrics_hp_json = json.load(f) + test_metrics_hp_json = test_metrics_hp_json["results"] + + regressions = [] + for task in tasks: + # The two-sample z-test comparing the golden and under-test high-precision configuration + ref_mean_hp = golden_metrics_json[hp_dtype][task]["mean"] + ref_stderr_hp = golden_metrics_json[hp_dtype][task]["sem"] + test_mean_hp = test_metrics_hp_json[task]["acc"] + test_stderr_hp = test_metrics_hp_json[task]["acc_stderr"] + p_hp_value = ztest(ref_mean_hp, ref_stderr_hp, test_mean_hp, test_stderr_hp) + print(f"Z-Test high precision p-value={p_hp_value*100:.2f}% in {task} task") + if p_hp_value < 0.05: + regressions.append(f"Z-Test high precision p-value is less than 0.05 in {task} task.") + + # The two-sample z-test comparing the golden and under-test low-precision configuration + if mode != None: + ref_mean_lp = golden_metrics_json[lp_dtype][mode][task]["mean"] + ref_stderr_lp = golden_metrics_json[lp_dtype][mode][task]["sem"] + else: + ref_mean_lp = golden_metrics_json[lp_dtype][task]["mean"] + ref_stderr_lp = golden_metrics_json[lp_dtype][task]["sem"] + test_mean_lp = test_metrics_lp_json[task]["acc"] + test_stderr_lp = test_metrics_lp_json[task]["acc_stderr"] + p_lp_value = ztest(ref_mean_lp, ref_stderr_lp, test_mean_lp, test_stderr_lp) + print(f"Z-Test low precision p-value={p_lp_value*100:.2f}% in {task} task") + if p_lp_value < 0.05: + regressions.append(f"Z-Test low precision p-value is less than 0.05 in {task} task.") + + # The single-sample z-test comparing the golden and under-test degradation of low-precision configuration + if mode != None: + ref_mean_diff = golden_metrics_json[lp_dtype][mode][task]["mean_diff"] + ref_stderr_diff = golden_metrics_json[lp_dtype][mode][task]["sem_diff"] + else: + ref_mean_diff = golden_metrics_json[lp_dtype][task]["mean_diff"] + 
ref_stderr_diff = golden_metrics_json[lp_dtype][task]["sem_diff"] + test_mean_diff = test_mean_hp - test_mean_lp + p_diff_value = ztest(ref_mean_diff, ref_stderr_diff, test_mean_diff) + print(f"Z-Test low precision diff p-value={p_diff_value*100:.2f}% in {task} task") + if p_diff_value < 0.05: + regressions.append(f"Z-Test low precision diff p-value is less than 0.05 in {task} task.") + + if len(regressions) == 0: + print("No regressions were detected!") + else: + print("Regressions were detected!") + for regression in regressions: + print(regression) diff --git a/neural_compressor/torch/algorithms/fp8_quant/utils/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/utils/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/neural_compressor/torch/algorithms/fp8_quant/utils/logger.py b/neural_compressor/torch/algorithms/fp8_quant/utils/logger.py new file mode 100644 index 00000000000..b4724fe31eb --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/utils/logger.py @@ -0,0 +1,240 @@ +# Taken and adjusted from neural-compressor-fork/neural_compressor/common/utils/logger.py +# Should be merged with INC logger once HQT code is inserted into INC +# TODO: SW-185347 merge INC logger with HQT logger +"""Logger: handles logging functionalities.""" + + +import logging +from logging.handlers import RotatingFileHandler +import os + +__all__ = ["logger"] + +# Define color escape codes +RESET = "\033[0m" +BOLD = "\033[1m" +UNDERLINE = "\033[4m" +WHITE = "\033[37m" +BG_RED = "\033[41m" +RED = "\033[91m" +GREEN = "\033[92m" +YELLOW = "\033[93m" +BLUE = "\033[94m" +PURPLE = "\033[95m" +CYAN = "\033[96m" + + +def _pretty_dict(value, indent=0): + """Make the logger dict pretty.""" + prefix = "\n" + " " * (indent + 4) + if isinstance(value, dict): + items = [prefix + repr(key) + ": " + _pretty_dict(value[key], indent + 4) for key in value] + return "{%s}" % (",".join(items) + "\n" + " " * indent) + elif isinstance(value, list): + items = [prefix + _pretty_dict(item, indent + 4) for item in value] + return "[%s]" % (",".join(items) + "\n" + " " * indent) + elif isinstance(value, tuple): + items = [prefix + _pretty_dict(item, indent + 4) for item in value] + return "(%s)" % (",".join(items) + "\n" + " " * indent) + else: + return repr(value) + + +logging.TRACE = 5 # There is no 'trace' level for python logger. + + +def trace(self, msg, *args, **kwargs): + """ + Log 'msg % args' with severity 'TRACE'. + + To pass exception information, use the keyword argument exc_info with + a true value, e.g. 
+ + logger.trace("Houston, we have a %s", "thorny problem", exc_info=1) + """ + if self.isEnabledFor(logging.TRACE): + self._log(logging.TRACE, msg, args, **kwargs) + + +logging.Logger.trace = trace +logging.IGNORE = 60 +logging.addLevelName(logging.TRACE, "TRACE") +logging.__all__ += ["TRACE", "trace"] + +log_levels = { + "0": logging.TRACE, # = 5 + "1": logging.DEBUG, # = 10 + "2": logging.INFO, # = 20 + "3": logging.WARNING, # = 30 + "4": logging.ERROR, # = 40 + "5": logging.CRITICAL, # = 50 + "6": logging.IGNORE, # = 60 (Disabling logger) +} +MAX_LOG_LEVEL_NAME_LEN = 8 + +DEFAULT_LOG_FILE_SIZE = 1024 * 1024 * 10 +DEFAULT_LOG_FILE_AMOUNT = 5 + + +class _Logger(object): + """_Logger class.""" + + __instance = None + + def __new__(cls): + """Create a singleton _Logger instance.""" + if _Logger.__instance is None: + _Logger.__instance = object.__new__(cls) + _Logger.__instance._init_log() + return _Logger.__instance + + def get_enable_console_val(self): + enableConsole = os.environ.get("ENABLE_CONSOLE", "False").upper() + if enableConsole not in ["TRUE", "FALSE"]: + raise Exception(f"Env var 'ENABLE_CONSOLE' has to be true or false.") + return enableConsole == "TRUE" + + def get_log_level(self): + log_level_str = os.environ.get("LOG_LEVEL_HQT", os.environ.get("LOG_LEVEL_ALL")) + if log_level_str is None: + return logging.INFO + if log_level_str not in log_levels: + raise Exception(f"Wrong Log Level value: '{log_level_str}'. Must be an integer 0-6.") + return log_levels[log_level_str] + + def prepare_logger_format(self): + # Time printing is added to format according to the value of PRINT_TIME env var. + print_time = os.environ.get("PRINT_TIME", "True") + time_format = "" if print_time.upper() in ["0", "FALSE"] else "%(asctime)s.%(msecs)06d" + return f"[{time_format}][%(name)s][%(levelname)s] %(message)s" + + # Create a formatter with lower case level name + @staticmethod + class LowercaseLevelNameFormatter(logging.Formatter): + def format(self, record): + level_name = record.levelname + record.levelname = record.levelname.lower().ljust(MAX_LOG_LEVEL_NAME_LEN) + message = super().format(record) + record.levelname = level_name + return message + + # Create a formatter with color for the console output + @staticmethod + class ColoredFormatter(logging.Formatter): + def format(self, record): + message = super().format(record) + # if record.levelname == 'TRACE': + # stays black + if record.levelname == "DEBUG": + style = CYAN + elif record.levelname == "INFO": + style = GREEN + elif record.levelname == "WARNING": + style = f"{BOLD}{YELLOW}" + elif record.levelname == "ERROR": + style = f"{BOLD}{RED}" + elif record.levelname == "CRITICAL": + style = f"{BG_RED}{BOLD}{WHITE}" + else: + return message + return message.replace( + record.levelname, + f"{style}{record.levelname.lower().ljust(MAX_LOG_LEVEL_NAME_LEN)}{RESET}", + 1, + ) + + def _init_log(self): + """Setup the logger format and handler.""" + enableConsole = self.get_enable_console_val() + self._logger = logging.getLogger("HQT") + log_level = self.get_log_level() + if log_level == logging.IGNORE: + self._logger.disabled = True + else: + # according to: swtools_sdk/hl_logger/src/hllog_core.cpp + self._logger.handlers.clear() + self._logger.setLevel(log_level) + logging_format = self.prepare_logger_format() + hls_id = int(os.getenv("HLS_ID", "-1")) + local_rank_id = int(os.getenv("ID", os.getenv("OMPI_COMM_WORLD_RANK", "-1"))) + habana_logs_path = os.getenv("HABANA_LOGS") + if habana_logs_path is None: + habana_logs_path = ( + 
"/tmp/.habana_logs" if os.getenv("HOME") is None else os.getenv("HOME") + "/.habana_logs" + ) + log_folder = f"{habana_logs_path}{''if hls_id < 0 else '/{}'.format(hls_id)}" + log_folder = f"{log_folder}{''if local_rank_id < 0 else '/{}'.format(local_rank_id)}" + try: + os.makedirs(log_folder, exist_ok=True) + except OSError as error: + print( + f"Warning: Directory '{log_folder}' can not be created for HQT logs: {error.strerror}. Logger is disabled." + ) + self._logger.disabled = True + pass + file_path = log_folder + "/hqt_log.txt" + log_file_size = int(os.getenv("HQT_LOG_FILE_SIZE", DEFAULT_LOG_FILE_SIZE)) + if log_file_size < 0: + print( + f"Warning: Log file size value is not valid [{log_file_size}]. Using default value [{DEFAULT_LOG_FILE_SIZE}]" + ) + log_file_size = DEFAULT_LOG_FILE_SIZE + log_file_amount = int(os.getenv("HQT_LOG_FILE_AMOUNT", DEFAULT_LOG_FILE_AMOUNT)) + if log_file_amount < 0: + print( + f"Warning: Log file amount value is not valid [{log_file_amount}]. Using default value [{DEFAULT_LOG_FILE_AMOUNT}]" + ) + log_file_amount = DEFAULT_LOG_FILE_AMOUNT + fileHandler = RotatingFileHandler( + file_path, backupCount=log_file_amount, maxBytes=log_file_size + ) # default mode = append ("a") + formatter = _Logger.LowercaseLevelNameFormatter(logging_format, "%Y-%m-%d %H:%M:%S") + fileHandler.setFormatter(formatter) + self._logger.addHandler(fileHandler) + if enableConsole: + import sys + + streamHandler = logging.StreamHandler(sys.stdout) + if sys.stdout.isatty(): + streamHandler.setFormatter(_Logger.ColoredFormatter(logging_format, "%Y-%m-%d %H:%M:%S")) + else: + streamHandler.setFormatter(formatter) + self._logger.addHandler(streamHandler) + self._logger.propagate = False + + def log(self, func, msg, *args, **kwargs): + kwargs.setdefault("stacklevel", 3) + if isinstance(msg, dict): + for _, line in enumerate(_pretty_dict(msg).split("\n")): + func(line, *args, **kwargs) + else: + func(msg, *args, **kwargs) + + def trace(self, msg, *args, **kwargs): + """Output log with the trace level.""" + self.log(self._logger.trace, msg, *args, **kwargs) + + def debug(self, msg, *args, **kwargs): + """Output log with the debug level.""" + self.log(self._logger.debug, msg, *args, **kwargs) + + def info(self, msg, *args, **kwargs): + """Output log with the info level.""" + self.log(self._logger.info, msg, *args, **kwargs) + + def warning(self, msg, *args, **kwargs): + """Output log with the warning level (Alias of the method warn).""" + self.log(self._logger.warning, msg, *args, **kwargs) + + def error(self, msg, *args, **kwargs): + """Output log with the error level.""" + self.log(self._logger.error, msg, *args, **kwargs) + + def critical(self, msg, *args, **kwargs): + """Output log with the critical level.""" + self.log(self._logger.critical, msg, *args, **kwargs) + + fatal = critical + + +logger = _Logger() diff --git a/test/3x/torch/algorithms/fp8_quant/__init__.py b/test/3x/torch/algorithms/fp8_quant/__init__.py new file mode 100644 index 00000000000..7fec54b4191 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/__init__.py @@ -0,0 +1,6 @@ +from .tester import run_accuracy_test, TestVector + +__all__ = [ + "run_accuracy_test", + "TestVector", +] diff --git a/test/3x/torch/algorithms/fp8_quant/conftest.py b/test/3x/torch/algorithms/fp8_quant/conftest.py new file mode 100644 index 00000000000..3497af8b3f4 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/conftest.py @@ -0,0 +1,12 @@ +# Called once at the beginning of the test session +def pytest_sessionstart(): + import 
habana_frameworks.torch.core as htcore + import torch + + htcore.hpu_set_env() + + # Use reproducible results + torch.use_deterministic_algorithms(True) + + # Fix the seed - just in case + torch.manual_seed(0) diff --git a/test/3x/torch/algorithms/fp8_quant/fp8_tests.py b/test/3x/torch/algorithms/fp8_quant/fp8_tests.py new file mode 100644 index 00000000000..adb9e426409 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/fp8_tests.py @@ -0,0 +1,174 @@ +import torch + +import habana_quantization_toolkit +import habana_frameworks.torch.core as htcore + +# This file is for small tests run for debug flow and accuracy. (Not for CI) + + +class TinyBlock(torch.nn.Module): + + def __init__(self): + super(TinyBlock, self).__init__() + self.pre_linear = torch.nn.Linear(2, 1, bias=False) + self.pre_linear.weight = torch.nn.Parameter(torch.ones([1, 2])) + + def forward(self, x): + x = self.pre_linear(x) + return x + + +class TinyModel(torch.nn.Module): + + def __init__(self): + super(TinyModel, self).__init__() + self.block = TinyBlock() + + def forward(self, x): + x = self.block(x) + return x + + +class TinyBlock2(torch.nn.Module): + + def __init__(self): + super(TinyBlock2, self).__init__() + self.pre_linear = torch.nn.Linear(2, 1, bias=False) + self.pre_linear.weight = torch.nn.Parameter(torch.ones([1, 2])) + self.pre_linear2 = torch.nn.Linear(1, 1, bias=False) + self.pre_linear2.weight = torch.nn.Parameter(torch.ones([1, 1])) + + def forward(self, x): + x = self.pre_linear(x) + x = self.pre_linear2(x) + return x + + +class TinyModel2(torch.nn.Module): + + def __init__(self): + super(TinyModel2, self).__init__() + self.block = TinyBlock2() + + def forward(self, x): + x = self.block(x) + return x + + +class TinyModel3(torch.nn.Module): + + def __init__(self): + super(TinyModel3, self).__init__() + self.block = TinyBlock() + self.block2 = TinyBlock2() + + def forward(self, x, b): + if b: + x = self.block(x) + else: + x = self.block2(x) + return x + + +model = TinyModel() +model.eval() +model = model.to("hpu").to(torch.bfloat16) +htcore.hpu_initialize() +habana_quantization_toolkit.prep_model(model) # fp8 additions + + +with torch.no_grad(): + + # >>> new_fp8converted_input = (torch.tensor(MaxAbs(input), dtype=torch.bfloat16) / torch.tensor(InputScale, dtype=torch.bfloat16)).to(torch.float8_e4m3fn) + # >>> new_fp8converted_weight = (torch.tensor(MaxAbs(weight), dtype=torch.bfloat16) / torch.tensor(WeightScale, dtype=torch.bfloat16)).to(torch.float8_e4m3fn) + # >>> mul_result = new_fp8converted_weight.to(torch.bfloat16) * new_fp8converted_input.to(torch.bfloat16) + # >>> result = mul_result * torch.tensor(InputScale, dtype=torch.bfloat16) * torch.tensor(WeightScale, dtype=torch.bfloat16) + + # If the results of the first 2 lines > 240 (or nan), assume they are equal to 240. 
(In G2 or G3 with specific fp8 representation settings) + + # Run simulator: + # Gaudi2: run_coral_sim --chip-type gaudi2 -r -D 32 + # Gaudi3: run_coral_sim --chip-type gaudi3 -r -D 32 + # cd .../quantization_toolkit/habana_quantization_toolkit/tests/ + + # Test1: (Disable (comment) all other tests, delete all files from the test_outputs folder) + # Run: + # QUANT_CONFIG=test_jsons/test_measure.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_hw_quant.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_pow2_quant.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_unit_quant.json python3 fp8_tests.py + + out_arange = model((torch.tensor([[232, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + out_arange = model((torch.tensor([[240, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + out_arange = model((torch.tensor([[248, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + # Result (Same for Gaudi2 and Gaudi3): + # for HW/POW2: + # tensor([[224.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[256.]], device='hpu:0', dtype=torch.bfloat16) + # for Unit: + # tensor([[224.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + + # Test2: (Disable (comment) all other tests, delete all files from the test_outputs folder) + # Run: + # QUANT_CONFIG=test_jsons/test_measure.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_hw_quant.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_pow2_quant.json python3 fp8_tests.py + # QUANT_CONFIG=test_jsons/test_unit_quant.json python3 fp8_tests.py + + out_arange = model((torch.tensor([[3720, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + out_arange = model((torch.tensor([[3721, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + out_arange = model((torch.tensor([[13721, 0]], dtype=torch.bfloat16)).to("hpu")) + print(out_arange) + + # Result: + # for HW (Gaudi2): + # tensor([[3584.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[3840.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[3840.]], device='hpu:0', dtype=torch.bfloat16) + # for HW (Gaudi3) and Pow2: + # tensor([[3584.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[3840.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[13312.]], device='hpu:0', dtype=torch.bfloat16) + # for Unit: + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + # tensor([[240.]], device='hpu:0', dtype=torch.bfloat16) + + # Test3: (Disable (comment) all other tests, delete all files from the test_outputs folder) + # (Change Line 73 above to: model = TinyModel3()) + # Run: (add LOG_LEVEL_HQT=0/1 for additional logs) + # (Uncomment lines 164+165) + # 1) QUANT_CONFIG=test_jsons/test_measure.json python3 fp8_tests.py + # 2) QUANT_CONFIG=test_jsons/test_hw_quant.json python3 fp8_tests.py + # 3) QUANT_CONFIG=test_jsons/test_hw_quant_ignored_unmeasured_models.json python3 fp8_tests.py + # (Comment lines 164+165, Uncomment lines 166+167) + # 4) QUANT_CONFIG=test_jsons/test_hw_quant.json python3 fp8_tests.py + # 5) QUANT_CONFIG=test_jsons/test_hw_quant_ignored_unmeasured_models.json python3 fp8_tests.py + + # out_arange = model((torch.tensor([[232, 0]], dtype=torch.bfloat16)).to('hpu'), True) + # print(out_arange) + # out_arange = model((torch.tensor([[232, 0]], 
dtype=torch.bfloat16)).to('hpu'), False) + # print(out_arange) + + # Result: + # 1) tensor([[232.]], device='hpu:0', dtype=torch.bfloat16) + # 2) tensor([[224.]], device='hpu:0', dtype=torch.bfloat16) + # 3) tensor([[224.]], device='hpu:0', dtype=torch.bfloat16) + # 4) Exception: Error - Layer 'block2.pre_linear' was called but was not quantized because no measures were supplied. + # 5) tensor([[232.]], device='hpu:0', dtype=torch.bfloat16) + + # fp8 additions + habana_quantization_toolkit.finish_measurements(model) diff --git a/test/3x/torch/algorithms/fp8_quant/pytest.ini b/test/3x/torch/algorithms/fp8_quant/pytest.ini new file mode 100644 index 00000000000..e081c3c20c8 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +markers = + deepspeed: marks tests as deepspeed (deselect with '-m "not deepspeed"') diff --git a/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant.json b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant.json new file mode 100644 index 00000000000..eb4f8e8208e --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant.json @@ -0,0 +1,16 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "dump_stats_path": "./test_outputs/unit_test" +} \ No newline at end of file diff --git a/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant_ignored_unmeasured_models.json b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant_ignored_unmeasured_models.json new file mode 100644 index 00000000000..54a779cee7e --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_hw_quant_ignored_unmeasured_models.json @@ -0,0 +1,17 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "dump_stats_path": "./test_outputs/unit_test", + "ignore_modules_wo_measures": "true" +} \ No newline at end of file diff --git a/test/3x/torch/algorithms/fp8_quant/test_jsons/test_measure.json b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_measure.json new file mode 100644 index 00000000000..e2743faafa7 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_measure.json @@ -0,0 +1,13 @@ +{ + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [] + }, + "dump_stats_path": "./test_outputs/unit_test" +} \ No newline at end of file diff --git a/test/3x/torch/algorithms/fp8_quant/test_jsons/test_pow2_quant.json b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_pow2_quant.json new file mode 100644 index 00000000000..7f44824fa9d --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_pow2_quant.json @@ -0,0 +1,16 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_pow2", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "dump_stats_path": "./test_outputs/unit_test" +} \ No newline at end of file diff --git a/test/3x/torch/algorithms/fp8_quant/test_jsons/test_unit_quant.json b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_unit_quant.json new file mode 100644 index 00000000000..60127bbad20 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/test_jsons/test_unit_quant.json @@ -0,0 +1,16 @@ +{ + "mode": 
"QUANTIZE", + "observer": "maxabs", + "scale_method": "unit_scale", + "allowlist": { + "types": [], + "names": [] + }, + "blocklist": { + "types": [], + "names": [ + "lm_head" + ] + }, + "dump_stats_path": "./test_outputs/unit_test" +} \ No newline at end of file diff --git a/test/3x/torch/algorithms/fp8_quant/tester.py b/test/3x/torch/algorithms/fp8_quant/tester.py new file mode 100644 index 00000000000..374c9ada590 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/tester.py @@ -0,0 +1,218 @@ +from __future__ import annotations + +import itertools +import logging +import os.path +import random +import typing +from dataclasses import dataclass + +import torch + +import habana_frameworks as htcore + +from habana_quantization_toolkit._core.common import mod_default_dict + +from habana_quantization_toolkit._quant_common.quant_config import ( + Fp8cfg, + QuantMode, + ScaleMethod, +) + + +@dataclass +class TestVector: + # Mark to pytest that it is not a tester class + __test__ = False + + inputs: typing.Sequence[torch.Tensor] + atol: typing.Optional[float] = None + rtol: typing.Optional[float] = None + + +M = typing.TypeVar("M", bound=torch.nn.Module) + + +def _assert_quantized_correctly(*, reference_model: WrapModel, quantized_model: WrapModel): + """ + In quantized mode, assert the reference model is not quantized, and the quantized model is. + Otherwise, assert that both are not quantized. + """ + for reference_name in mod_default_dict.keys(): + quantized_name = mod_default_dict[reference_name].patched_module.__name__ + + assert not reference_model.has_name(quantized_name) + assert not quantized_model.has_name(reference_name), f"{reference_name=} should not be in the quantized model" + + if reference_model.has_name(reference_name): + assert quantized_model.has_name(quantized_name), f"{quantized_name=} should be in the quantized model" + + +def run_accuracy_test( + *, + module_class: typing.Type[M], + module_args: typing.Sequence = (), + module_kwargs: typing.Mapping = {}, + lp_dtype: torch.dtype, + scale_method: ScaleMethod, + measure_vectors: typing.Optional[typing.Iterable[TestVector]] = None, + test_vectors: typing.Iterable[TestVector], + seed: typing.Optional[int] = None, +): + """ + Run both the reference and the quantized versions of this module, + and compare the outputs on every test vector. + + First the measure vectors are used for measurements. + + This test also makes asserts the quantization actually happened. + This may be moved to another tests in the future. + + You can use the generate_test_vectors.py script to generate input test vectors. + + Args: + module_class: The reference module class to test. + This should be the direct module to test, e.g. Matmul, Linear, etc. + module_args: The positional arguments to pass to the module constructor. Default is empty. + module_kwargs: The keyword arguments to pass to the module constructor. Default is empty. + lp_dtype: The dtype to quantize to. + scale_method: The scaling method to use. + measure_vectors: An iterable of vectors, each contains a sequence of inputs. + If not given, `itertools.tee()` for `test_vectors` will be used. + That is, all the test vectors will be used for the measurements. + test_vectors: An iterable of test vectors, each contains a sequence of inputs and tolerance + seed: The random seed to use. If not given, will use a default seed derived from the module name. 
+ """ + + # If no measure vectors given - use the same dataset as for the test vectors + # Use `help(itertools.tee)` for more info + if measure_vectors is None: + measure_vectors, test_vectors = itertools.tee(test_vectors) + + for mode in [QuantMode.MEASURE, QuantMode.QUANTIZE]: + import habana_quantization_toolkit.prepare_quant.prepare_model as hqt + + reference_model = WrapModel(module_class, seed, *module_args, **module_kwargs) + quantized_model = WrapModel(module_class, seed, *module_args, **module_kwargs) + + config = _get_test_only_config( + mode=mode, + lp_dtype=lp_dtype, + scale_method=scale_method, + ) + hqt._prep_model_with_predefined_config(quantized_model, config=config) + + _assert_quantized_correctly(reference_model=reference_model, quantized_model=quantized_model) + + vectors = { + QuantMode.MEASURE: measure_vectors, + QuantMode.QUANTIZE: test_vectors, + }[mode] + + for vector in vectors: + reference_output = reference_model(*(input.clone() for input in vector.inputs)).to(float) + quantized_output = quantized_model(*(input.clone() for input in vector.inputs)).to(float) + + # Override tolerance values given by the caller + tolerance = { + key: getattr(vector, key) for key in ["atol", "rtol"] if getattr(vector, key, None) is not None + } + + # Accuracy check against the reference module + assert torch.allclose(reference_output, quantized_output, **tolerance), ( + f"Test vector fails in accuracy test: " + f"\n inputs={vector.inputs}" + f"\n {reference_output=}" + f"\n {quantized_output=}" + f"\n {lp_dtype=}" + f"\n {scale_method.name=}" + ) + + hqt.finish_measurements(quantized_model) + + +def _set_optional_seed(*, module_class: typing.Type[M], seed: typing.Optional[int]): + """ + Set random seed to a unique reproducible value derived from the module. + + Args: + module_class: The module class to test. + This should be the direct module to test, e.g. Matmul, Linear, etc. + seed: The random seed to use. If not given, will use a default seed derived from the module name. + """ + if seed is None: + import hashlib + + # We use sha256 to ensure a deterministic has, as opposed to `builtins.hash`, which sadly is not so. + seed = int.from_bytes( + bytes=hashlib.sha256(module_class.__name__.encode("utf-8")).digest()[:4], + byteorder="big", + ) + + logging.info(f"Using {seed=}") + + random.seed(seed) + torch.manual_seed(seed) + + +class WrapModel(torch.nn.Module): + """ + Wrap an inner module. + If we do not wrap the inner module, it will not be quantized properly. + + Maybe we can change this behavior in the future. + """ + + def __init__( + self, + module_class: typing.Type[M], + seed: typing.Optional[int], + /, + *args, + **kwargs, + ): + super().__init__() + _set_optional_seed(module_class=module_class, seed=seed) + self.inner = module_class(*args, **kwargs) + + def forward(self, *args, **kwargs): + return self.inner(*args, **kwargs) + + def has_name(self, module_name: str) -> bool: + return any(module._get_name() == module_name for module in self.modules()) + + +TEST_ONLY_OUTPUT_DIRECTORY = f"habana_quantization_toolkit/tests/output/" + + +def get_test_unique_dump_path(): + # This is a unique id of the test including the parameters, thanks to pytest. 
+ # TODO: make sure this globally-ever unique (probably add global init timestamp) + unique_test_id = os.environ.get("PYTEST_CURRENT_TEST") + return os.path.join(TEST_ONLY_OUTPUT_DIRECTORY, unique_test_id) + + +def _get_test_only_config( + *, + mode: QuantMode, + scale_method: ScaleMethod, + lp_dtype: torch.dtype, +) -> Fp8cfg: + """ + Should NOT be used externally. + + Return a new config used only for the tests. + """ + + # TODO: replace this with a version that does not use strings but direct values. + # It is currently needed because of how Fp8cfg.parse() works. + return Fp8cfg.parse( + { + "method": "HOOKS", + "mode": mode.name, + "observer": "maxabs", + "fp8_config": str(lp_dtype).replace("torch.float8_", "")[:4], + "scale_method": scale_method.name, + "dump_stats_path": get_test_unique_dump_path(), + } + ) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/__init__.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/__init__.py new file mode 100644 index 00000000000..2516c4e1ef6 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/__init__.py @@ -0,0 +1,6 @@ +""" +The unit_test package contains a `test_.py` file for every module supported +in the habana quantization toolkit. + +To run use `pytest`. +""" diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_deepspeed.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_deepspeed.py new file mode 100644 index 00000000000..f0fe3ffcfff --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_deepspeed.py @@ -0,0 +1,86 @@ +import os +import typing + +import pytest +import torch + +from habana_quantization_toolkit._quant_common.quant_config import ScaleMethod + +from habana_quantization_toolkit.tests import run_accuracy_test, TestVector + + +class LinearBlock(torch.nn.Module): + def __init__(self): + super(LinearBlock, self).__init__() + self.linear_ = torch.nn.Linear(2, 2, bias=True) + self.linear_.weight = torch.nn.Parameter(torch.arange(0.0, 4.0).reshape(2, 2)) + self.linear_.bias = torch.nn.Parameter(torch.zeros(2)) + + def forward(self, x): + return self.linear_(x) + + +class TinyBlock(torch.nn.Module): + def __init__(self): + super(TinyBlock, self).__init__() + self.pre_linear = torch.nn.Linear(2, 2, bias=False) + self.pre_linear.weight = torch.nn.Parameter(torch.ones((2, 2)) / 4) + + self.linear1 = LinearBlock() + self.post_linear = torch.nn.Linear(2, 2, bias=False) + self.post_linear.weight = torch.nn.Parameter(torch.ones((2, 2)) / 4) + self.linear2 = LinearBlock() + + def forward(self, x): + x = self.pre_linear(x) + x = self.linear1(x) + x = self.post_linear(x) + x = self.linear2(x) + x = x.sum() + return x + + +class TinyModel(torch.nn.Module): + def __init__(self, **kwargs): + super().__init__() + + block = TinyBlock() + + # no kernel inject - currently only works on Habana's DeepSpeed fork! + # these layers will be switched to LinearAllReduce. + injection_policy = {TinyBlock: ("linear1.linear_", "linear2.linear_")} + + # Initialize deepspeed on model creation + import deepspeed + block = deepspeed.init_inference( + block, + injection_policy=injection_policy, + **kwargs, + ) + self.block = block.module + + def forward(self, x): + return self.block(x) + + +def get_test_vectors(dtype: torch.dtype) -> typing.Iterable[TestVector]: + yield TestVector( + inputs=[torch.ones(1, 2).to(device="hpu", dtype=dtype)], + ) + + +# @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32]) +# TODO: float32 doesn't work - WHY? 
+# TODO: add ticket +@pytest.mark.deepspeed +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_deepspeed_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype): + world_size = 1 + run_accuracy_test( + module_class=TinyModel, + test_vectors=get_test_vectors(dtype=hp_dtype), + lp_dtype=lp_dtype, + scale_method=ScaleMethod.MAXABS_HW, + module_kwargs={"dtype": hp_dtype, "mp_size": world_size}, + ) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py new file mode 100644 index 00000000000..502aaeb457d --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py @@ -0,0 +1,29 @@ +""" +Use this module as an example of how to write new unit tests for layers. +""" + +import torch + +import habana_quantization_toolkit as hqt + +from habana_quantization_toolkit._quant_common.quant_config import QuantMode +from habana_quantization_toolkit._quant_common.helper_modules import Matmul + + +class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.inner = Matmul() + + +def test_config_json(): + model = Model() + + for mode in [QuantMode.MEASURE, QuantMode.QUANTIZE]: + name = { + QuantMode.MEASURE: "measure", + QuantMode.QUANTIZE: "quant", + }[mode] + config_path = f"llama_{name}" + hqt.prep_model(model, config_path=config_path) + hqt.finish_measurements(model) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_matmul_fp8.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_matmul_fp8.py new file mode 100644 index 00000000000..97151d54cd8 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_matmul_fp8.py @@ -0,0 +1,71 @@ +import itertools +import pytest +import torch + +from typing import Iterable, Tuple +from habana_quantization_toolkit._core.fp_utils import FP8_143_SCALES +from habana_quantization_toolkit._quant_common.helper_modules import matmul_fp8 +import habana_frameworks.torch.utils.experimental as htexp + + +def run_test_matmul_fp8( + *, + hp_dtype: torch.dtype, + lp_dtype: torch.dtype, + scales: Tuple[float, float], +): + torch.manual_seed(0) + x = torch.randn(2, 2, dtype=float).clone() + y = torch.randn(2, 2, dtype=float).clone() + + x_scale, y_scale = scales + expected_result = (torch.matmul(x, y) / x_scale / y_scale).to(dtype=hp_dtype) + + result = matmul_fp8( + input=x.to(device="hpu").to(dtype=lp_dtype), + other=y.to(device="hpu").to(dtype=lp_dtype), + out_dtype=hp_dtype, + scale_input_inv=1 / x_scale, + scale_other_inv=1 / y_scale, + ) + + assert torch.allclose(expected_result, result, rtol=0.1), f"Matmul failed for {x_scale=} {y_scale=}" + + +def get_fp8_143_scales(): + device_type = htexp._get_device_type() + return FP8_143_SCALES[device_type] + + +def get_scales_pairs_not_both_hw_aligned() -> Iterable[Tuple[float, float]]: + not_hw_aligned_scales = [0.25] + + return itertools.chain( + zip(not_hw_aligned_scales, not_hw_aligned_scales), + zip(not_hw_aligned_scales, get_fp8_143_scales()), + zip(get_fp8_143_scales(), not_hw_aligned_scales), + ) + + +def get_scales_pairs_both_hw_aligned() -> Iterable[Tuple[float, float]]: + return zip(get_fp8_143_scales(), get_fp8_143_scales()) + + +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_matmul_fp8_not_both_hw_aligned( + hp_dtype: 
torch.dtype, + lp_dtype: torch.dtype, +): + for scales in get_scales_pairs_not_both_hw_aligned(): + run_test_matmul_fp8(hp_dtype=hp_dtype, lp_dtype=lp_dtype, scales=scales) + + +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_matmul_fp8_both_hw_aligned( + hp_dtype: torch.dtype, + lp_dtype: torch.dtype, +): + for scales in get_scales_pairs_both_hw_aligned(): + run_test_matmul_fp8(hp_dtype=hp_dtype, lp_dtype=lp_dtype, scales=scales) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py new file mode 100644 index 00000000000..6994bf437ca --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py @@ -0,0 +1,40 @@ +import typing + +import pytest +import torch +from habana_quantization_toolkit._quant_common.quant_config import ScaleMethod + +from habana_quantization_toolkit.tests import run_accuracy_test, TestVector + + +def get_test_vectors(*, dtype: torch.dtype, C_in: int, H: int, W: int) -> typing.Iterable[TestVector]: + yield TestVector( + inputs=[torch.ones(1, C_in, H, W, dtype=dtype, device="hpu")], + atol=0.2, + ) + + +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_conv2d_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype): + C_in = 1 + C_out = 1 + K = 3 + + H = W = 8 + + run_accuracy_test( + module_class=torch.nn.Conv2d, + module_kwargs={ + "in_channels": C_in, + "out_channels": C_out, + "kernel_size": K, + "padding": 1, + "bias": False, + "device": "hpu", + "dtype": hp_dtype, + }, + lp_dtype=lp_dtype, + scale_method=ScaleMethod.MAXABS_HW, + test_vectors=get_test_vectors(dtype=hp_dtype, C_in=C_in, H=H, W=W), + ) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py new file mode 100644 index 00000000000..528b5d9358d --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py @@ -0,0 +1,33 @@ +import typing + +import pytest +import torch +from habana_quantization_toolkit._quant_common.quant_config import ScaleMethod + +from habana_quantization_toolkit.tests import run_accuracy_test, TestVector + + +def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int) -> typing.Iterable[TestVector]: + yield TestVector( + inputs=[torch.ones(N, D_in, dtype=dtype, device="hpu")], + atol=0.02, + ) + + +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_linear_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype): + N = 1 + D_in = 8 + H = 5 + run_accuracy_test( + module_class=torch.nn.Linear, + module_kwargs={ + "in_features": D_in, + "out_features": H, + "bias": False, + }, + lp_dtype=lp_dtype, + scale_method=ScaleMethod.MAXABS_HW, + test_vectors=get_test_vectors(dtype=hp_dtype, N=N, D_in=D_in), + ) diff --git a/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py new file mode 100644 index 00000000000..86ae332b311 --- /dev/null +++ b/test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py @@ -0,0 +1,56 @@ +import typing + +import pytest +import torch +from habana_quantization_toolkit._quant_common.quant_config import ScaleMethod + +from 
habana_quantization_toolkit.tests import run_accuracy_test, TestVector + + +def get_test_vectors(*, dtype: torch.dtype) -> typing.Iterable[TestVector]: + yield TestVector( + inputs=[ + torch.eye(2, dtype=dtype, device="hpu"), + torch.eye(2, dtype=dtype, device="hpu"), + ], + atol=0.2, + ) + yield TestVector( + inputs=[ + torch.randn((2, 2), dtype=dtype, device="hpu"), + torch.randn((2, 2), dtype=dtype, device="hpu"), + ], + atol=0.2, + ) + yield TestVector( + inputs=[ + torch.eye(2, dtype=dtype, device="hpu"), + torch.randn((2, 2), dtype=dtype, device="hpu"), + ], + atol=0.2, + ) + + +class Matmul(torch.nn.Module): + """ + This is a mimic of other implementations of `Matmul`. + It is here to not create a dependency on optimum-habana (which is logically needed). + It should not be used directly in user code. + """ + + def __init__(self): + super().__init__() + + def forward(self, x, y): + return torch.matmul(x, y) + + +@pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32]) +@pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn]) +def test_matmul_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype): + run_accuracy_test( + module_class=Matmul, + lp_dtype=lp_dtype, + scale_method=ScaleMethod.MAXABS_HW, + test_vectors=get_test_vectors(dtype=hp_dtype), + ) From 96bffd97a94412b43566105dfe0b8007525021dd Mon Sep 17 00:00:00 2001 From: Uri Livne Date: Sun, 7 Jul 2024 18:23:30 +0300 Subject: [PATCH 07/51] [SW-184714] Add internal folder to fp8 quant This is a folder used for experiments, not to be used by users Change-Id: I9e221ae582794e304e95392c0f37638f7bce69bc --- .../internal/diffusion_evaluation/README | 32 + .../SR_evaluation/README.md | 37 + .../SR_evaluation/create_SR_dataset.py | 87 ++ .../imagenet1000_clsidx_to_labels.txt | 1000 +++++++++++++++++ .../SR_evaluation/super_res_eval.py | 70 ++ .../diffusion_evaluation/create_dataset.py | 90 ++ .../diffusion_evaluation/evaluator.py | 102 ++ .../imagenet_quant.py | 75 ++ .../inference_quant_examples/run_example.sh | 5 + 9 files changed, 1498 insertions(+) create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/README create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/README.md create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/create_SR_dataset.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/imagenet1000_clsidx_to_labels.txt create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/super_res_eval.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/create_dataset.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/evaluator.py create mode 100644 neural_compressor/torch/algorithms/fp8_quant/internal/inference_quant_examples/imagenet_quant.py create mode 100755 neural_compressor/torch/algorithms/fp8_quant/internal/inference_quant_examples/run_example.sh diff --git a/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/README b/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/README new file mode 100644 index 00000000000..3a71ca76a5e --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/README @@ -0,0 +1,32 @@ +How to calculate FID and clip score: + +We will use the MS-COCO database. 
We use this for two things:
+- Generating a large number of prompts which we can use to create diffusion images
+- Once we have diffusion images, we need a "ground truth" dataset to calculate the FID.
+
+1) Run a Python script which does the following:
+   - Takes a subset of MS-COCO
+   - Creates a CSV with prompts which can then be fed to the diffusion model. These prompts are taken from captions of the images in the subset
+   - Creates a new folder with the images from the subset
+   - The standard number of images for this evaluation is 30K or 10K
+
+Run the following:
+
+python create_dataset.py /datasets/coco2014
+
+Now, create the generated images from the CSV file.
+
+IMPORTANT!! - the script that does the actual evaluation (explained below) expects to get an image whose title is the prompt. For example, if the prompt is "a monster playing the guitar" then the name of the file created using diffusion should be "/a monster playing the guitar.png" (or jpg, etc.)
+
+IMPORTANT!! #2 - from my experience, stable diffusion inference returns an error for prompts containing the character '/'. There are very few, around one in a thousand. My recommendation: if you want to evaluate N images, create a subset of size N+30 and delete prompts with '/' in them. After creating the CSV I just deleted these prompts manually (takes 10 seconds to do).
+(Perhaps automating this should be a future commit.)
+
+2) Now, run the evaluation script. This does the following:
+- Calculates the CLIP score - takes the CLIP embedding of each generated image and the embedding of the caption that created it (in this case each image and its file name), then calculates the cosine distance between them.
+- Calculates the FID - takes the real and generated images and computes the FID distance metric.
+- Insert the number of images to evaluate - this can be the number of images in the subset created above, or fewer.
+
+To do this, run:
+
+python evaluator.py --device hpu --real_images_path /datasets/coco2014/val2014 --diff_images_path --num_of_images
+
diff --git a/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/README.md b/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/README.md
new file mode 100644
index 00000000000..208a9b8d81c
--- /dev/null
+++ b/neural_compressor/torch/algorithms/fp8_quant/internal/diffusion_evaluation/SR_evaluation/README.md
@@ -0,0 +1,37 @@
+How to calculate PSNR and SSIM for Super Resolution
+We will use the ImageNet validation dataset.
+
+The evaluation is done in the following steps:
+1) We take the ImageNet validation set, which has 50,000 images (we can also take a subset).
+2) Crop these images to 256*256 (center cropped) and save them as the "ground truth" dataset. The name of the saved image is its label.
+3) Downsample the images to 64*64 (using bicubic interpolation) and then restore them using Super Resolution.
+4) Calculate PSNR and SSIM between each ground truth image and its restored image, and print the mean.
+
+Steps 1, 2 and 4 are included here, while step 3 (downsampling and restoring) should be done separately, using the desired Super Resolution method. Keep in mind that this script assumes that the images are stored in a specific format (detailed later). The restored images path should later be given as an input to step 4.
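+
+As a reference, step 4 boils down to something like the following sketch (assuming scikit-image >= 0.19 and Pillow are installed; the function and path handling are illustrative, not the exact script):
+
+    import numpy as np
+    from PIL import Image
+    from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+
+    def mean_psnr_ssim(gt_paths, restored_paths):
+        # gt_paths[i] and restored_paths[i] are assumed to point to the same image (256*256, RGB)
+        psnrs, ssims = [], []
+        for gt_path, sr_path in zip(gt_paths, restored_paths):
+            gt = np.array(Image.open(gt_path).convert("RGB"))
+            sr = np.array(Image.open(sr_path).convert("RGB"))
+            psnrs.append(peak_signal_noise_ratio(gt, sr, data_range=255))
+            ssims.append(structural_similarity(gt, sr, channel_axis=-1, data_range=255))
+        return float(np.mean(psnrs)), float(np.mean(ssims))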
+
+You can skip steps 1 and 2 and use the images at /datasets/imagenet/val_cropped_labeled.
+You can also run a Python script which does the following to the ImageNet validation dataset:
+ - Crops images to 256*256 (this can also be changed using the --resize argument; 256*256 is the default)
+ - Saves the images with the convention /