Enhance 3.x torch algorithm entry #1779

Merged: 8 commits, May 9, 2024
Changes from 3 commits
11 changes: 5 additions & 6 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -26,7 +26,7 @@
class AutoRoundQuantizer(Quantizer):
def __init__(
self,
weight_config: dict = {},
quant_config: dict = {},
enable_full_range: bool = False,
batch_size: int = 8,
amp: bool = True,
@@ -51,8 +51,8 @@ def __init__(
"""Init a AutQRoundQuantizer object.

Args:
weight_config (dict): Configuration for weight quantization (default is an empty dictionary).
weight_config={
quant_config (dict): Configuration for weight quantization (default is an empty dictionary).
quant_config={
'layer1':##layer_name
{
'data_type': 'int',
@@ -89,9 +89,8 @@ def __init__(
scale_dtype (str): The data type of quantization scale to be used (default is "float32"), different kernels
have different choices.
"""
super().__init__(weight_config)
super().__init__(quant_config)
self.tokenizer = None
self.weight_config = weight_config
self.enable_full_range = enable_full_range
self.batch_size = batch_size
self.amp = amp
@@ -125,7 +124,7 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
self.rounder = AutoRoundProcessor(
model=model,
tokenizer=None,
weight_config=self.weight_config,
weight_config=self.quant_config,
enable_full_range=self.enable_full_range,
batch_size=self.batch_size,
amp=self.amp,
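With the rename, callers now construct the quantizer via `quant_config` instead of `weight_config`. A minimal sketch of the updated call, mirroring the test change at the bottom of this diff (the layer name and settings are illustrative, not taken from a real model):

```python
from neural_compressor.torch.algorithms.weight_only.autoround import AutoRoundQuantizer

# Illustrative per-layer settings following the docstring above; the layer
# name is a placeholder.
quant_config = {
    "model.decoder.layers.0.self_attn.k_proj": {
        "data_type": "int",
        "bits": 4,
        "group_size": 32,
        "sym": False,
    }
}

# After this change the keyword is `quant_config` (formerly `weight_config`).
quantizer = AutoRoundQuantizer(quant_config=quant_config)
```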
124 changes: 49 additions & 75 deletions neural_compressor/torch/quantization/algorithm_entry.py
@@ -30,7 +30,7 @@
StaticQuantConfig,
TEQConfig,
)
from neural_compressor.torch.utils import Mode, logger, register_algo
from neural_compressor.torch.utils import Mode, logger, postprocess_model, preprocess_quantizer, register_algo


###################### RTN Algo Entry ##################################
@@ -68,17 +68,9 @@ def rtn_entry(
"double_quant_group_size": quant_config.double_quant_group_size,
}

if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = RTNQuantizer(quant_config=weight_config)

quantizer = preprocess_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config)
model = quantizer.execute(model, mode=mode)

if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
postprocess_model(model, mode, quantizer)
return model


@@ -125,15 +117,11 @@ def gptq_entry(
)
kwargs.pop("example_inputs")
logger.warning("lm_head in transformer model is skipped by GPTQ")
if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = GPTQuantizer(quant_config=weight_config)

quantizer = preprocess_quantizer(model, quantizer_cls=GPTQuantizer, quant_config=weight_config)
model = quantizer.execute(model, mode=mode, *args, **kwargs)
if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
postprocess_model(model, mode, quantizer)

return model


@@ -177,17 +165,10 @@ def static_quant_entry(
inplace = kwargs.get("inplace", True)
assert example_inputs is not None, "Please provide example_inputs for static quantization."

if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = StaticQuantQuantizer(quant_config=quant_config_mapping)

quantizer = preprocess_quantizer(model, quantizer_cls=StaticQuantQuantizer, quant_config=quant_config_mapping)
model = quantizer.execute(model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace)
postprocess_model(model, mode, quantizer)

if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
return model


@@ -301,11 +282,7 @@ def awq_quantize_entry(
example_inputs = kwargs.get("example_inputs", None)
assert example_inputs is not None, "Please provide example_inputs for AWQ quantization."

if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = AWQQuantizer(quant_config=weight_config)

quantizer = preprocess_quantizer(model, quantizer_cls=AWQQuantizer, quant_config=weight_config)
model = quantizer.execute(
model,
mode=mode,
@@ -318,11 +295,8 @@
return_int=return_int,
use_full_range=use_full_range,
)
postprocess_model(model, mode, quantizer)

if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
return model


@@ -364,10 +338,18 @@ def teq_quantize_entry(
absorb_to_layer = quant_config.absorb_to_layer
folding = quant_config.folding
assert isinstance(model, torch.nn.Module), "only support torch module"
quantizer = TEQuantizer(
quant_config=weight_config, folding=folding, absorb_to_layer=absorb_to_layer, example_inputs=example_inputs

quantizer = preprocess_quantizer(
model,
quantizer_cls=TEQuantizer,
quant_config=weight_config,
folding=folding,
absorb_to_layer=absorb_to_layer,
example_inputs=example_inputs,
)
model = quantizer.execute(model, mode=mode, run_fn=run_fn, example_inputs=example_inputs, inplace=inplace)
postprocess_model(model, mode, quantizer)

return model


@@ -414,35 +396,33 @@ def autoround_quantize_entry(
scale_dtype = quant_config.scale_dtype

kwargs.pop("example_inputs")
if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = AutoRoundQuantizer(
weight_config=weight_config,
enable_full_range=enable_full_range,
batch_size=batch_size,
lr_scheduler=lr_scheduler,
use_quant_input=use_quant_input,
enable_minmax_tuning=enable_minmax_tuning,
lr=lr,
minmax_lr=minmax_lr,
low_gpu_mem_usage=low_gpu_mem_usage,
iters=iters,
seqlen=seqlen,
n_samples=n_samples,
sampler=sampler,
seed=seed,
n_blocks=n_blocks,
gradient_accumulate_steps=gradient_accumulate_steps,
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
scale_dtype=scale_dtype,
)

quantizer = preprocess_quantizer(
model,
quantizer_cls=AutoRoundQuantizer,
quant_config=weight_config,
enable_full_range=enable_full_range,
batch_size=batch_size,
lr_scheduler=lr_scheduler,
use_quant_input=use_quant_input,
enable_minmax_tuning=enable_minmax_tuning,
lr=lr,
minmax_lr=minmax_lr,
low_gpu_mem_usage=low_gpu_mem_usage,
iters=iters,
seqlen=seqlen,
n_samples=n_samples,
sampler=sampler,
seed=seed,
n_blocks=n_blocks,
gradient_accumulate_steps=gradient_accumulate_steps,
not_use_best_mse=not_use_best_mse,
dynamic_max_gap=dynamic_max_gap,
scale_dtype=scale_dtype,
)
model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
postprocess_model(model, mode, quantizer)

logger.info("AutoRound quantization done.")
return model

@@ -460,17 +440,11 @@ def hqq_entry(
from neural_compressor.torch.algorithms.weight_only.hqq import HQQuantizer

logger.info("Quantize model with the HQQ algorithm.")
if getattr(model, "quantizer", False):
quantizer = model.quantizer
else:
quantizer = HQQuantizer(quant_config=configs_mapping)

quantizer = preprocess_quantizer(model, quantizer_cls=HQQuantizer, quant_config=configs_mapping)
model = quantizer.execute(model, mode=mode)
postprocess_model(model, mode, quantizer)

if getattr(model, "quantizer", False):
del model.quantizer
else:
model.quantizer = quantizer
return model


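Every entry above is reduced to the same three-step shape: fetch or build the quantizer, execute it, then let `postprocess_model` handle the bookkeeping. A sketch of that pattern (`foo_entry` and `FooQuantizer` are hypothetical placeholders standing in for any algorithm entry and its Quantizer subclass, e.g. RTNQuantizer):

```python
from neural_compressor.torch.utils import Mode, postprocess_model, preprocess_quantizer


def foo_entry(model, configs_mapping, mode=Mode.QUANTIZE, *args, **kwargs):
    # Reuse the quantizer stashed on the model by an earlier `prepare` call,
    # or build a fresh one from the config mapping.
    quantizer = preprocess_quantizer(model, quantizer_cls=FooQuantizer, quant_config=configs_mapping)
    model = quantizer.execute(model, mode=mode)
    # Attach the quantizer on `prepare`; drop it on `convert`/`quantize`.
    postprocess_model(model, mode, quantizer)
    return model
```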
36 changes: 36 additions & 0 deletions neural_compressor/torch/utils/utility.py
@@ -131,3 +131,39 @@ class Mode(Enum):
PREPARE = "prepare"
CONVERT = "convert"
QUANTIZE = "quantize"


def preprocess_quantizer(model, quantizer_cls, quant_config=None, *args, **kwargs):
"""Process quantizer.

Initialize a quantizer or get `quantizer` attribute from model.

Args:
model (torch.nn.Module): pytorch model.
quantizer_cls (Quantizer): quantizer class of a specific algorithm.
quant_config (dict, optional): Specifies how to apply the algorithm on the given model.
Defaults to None.

Returns:
quantizer object.
"""
if not hasattr(model, "quantizer"):
quantizer = quantizer_cls(quant_config=quant_config, *args, **kwargs)
return quantizer
else:
return model.quantizer


def postprocess_model(model, mode, quantizer):
"""Process `quantizer` attribute of model according to current mode.

Args:
model (torch.nn.Module): pytorch model.
mode (Mode): The mode of current phase, including 'prepare', 'convert' and 'quantize'.
quantizer (Quantizer): quantizer object.
"""
if mode == Mode.PREPARE:
model.quantizer = quantizer
elif mode == Mode.CONVERT or mode == Mode.QUANTIZE:
if getattr(model, "quantizer", False):
del model.quantizer
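Taken together, the two helpers implement the prepare/convert handshake: `preprocess_quantizer` returns the quantizer already stashed on the model if there is one, and `postprocess_model` stashes it after `prepare` and removes it after `convert` or `quantize`. A minimal runnable sketch (`MyQuantizer` is a hypothetical stand-in for an algorithm-specific Quantizer subclass, not part of this PR):

```python
import torch

from neural_compressor.torch.utils import Mode, postprocess_model, preprocess_quantizer


class MyQuantizer:
    """Hypothetical stand-in for an algorithm-specific Quantizer subclass."""

    def __init__(self, quant_config=None):
        self.quant_config = quant_config


model = torch.nn.Linear(8, 8)

# prepare: nothing is stashed on the model yet, so a new quantizer is built ...
quantizer = preprocess_quantizer(model, quantizer_cls=MyQuantizer, quant_config={})
postprocess_model(model, Mode.PREPARE, quantizer)  # ... and pinned on the model
assert model.quantizer is quantizer

# convert: the pinned instance is reused instead of creating a fresh one ...
quantizer = preprocess_quantizer(model, quantizer_cls=MyQuantizer, quant_config={})
postprocess_model(model, Mode.CONVERT, quantizer)  # ... and removed afterwards
assert not hasattr(model, "quantizer")
```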
2 changes: 1 addition & 1 deletion test/3x/torch/quantization/weight_only/test_autoround.py
@@ -102,7 +102,7 @@ def test_quantizer(self):
"sym": False,
}
}
quantizer = AutoRoundQuantizer(weight_config=weight_config)
quantizer = AutoRoundQuantizer(quant_config=weight_config)
fp32_model = gpt_j_model

# quantizer execute