diff --git a/optimum/intel/neural_compressor/configuration.py b/optimum/intel/neural_compressor/configuration.py
index 7f5370e5ee..387c9b49d0 100644
--- a/optimum/intel/neural_compressor/configuration.py
+++ b/optimum/intel/neural_compressor/configuration.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import Dict, Optional, Union
 
 from neural_compressor.config import DistillationConfig, WeightPruningConfig, _BaseQuantizationConfig
@@ -28,6 +29,8 @@
     "post_training_weight_only": "weight_only",
 }
 
+logger = logging.getLogger(__name__)
+
 
 class INCConfig(BaseConfig):
     CONFIG_NAME = "inc_config.json"
@@ -49,6 +52,9 @@ def __init__(
         self.distillation = self._create_distillation_config(distillation) or {}
         self.save_onnx_model = save_onnx_model
 
+        if self.save_onnx_model:
+            logger.warning("ONNX model saving is deprecated and will be removed soon.")
+
     @staticmethod
     def _create_quantization_config(config: Union[Dict, _BaseQuantizationConfig]):
         # TODO : add activations_dtype and weights_dtype
diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index 90c0c39d68..5133a42bc6 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -200,9 +200,15 @@ def quantize(
         use_xpu = device == torch.device("xpu") or device == "xpu"
         calibration_dataloader = None
 
+        if save_onnx_model:
+            logger.warning("ONNX model export is deprecated and will be removed soon.")
+
+        if isinstance(self._original_model, ORTModel):
+            logger.warning("ONNX model quantization is deprecated and will be removed soon.")
+
         if save_onnx_model and isinstance(self._original_model, ORTModel):
+            logger.warning("The model provided is already an ONNX model. Setting `save_onnx_model` to False.")
             save_onnx_model = False
-            logger.warning("Model provided is an ONNX model, `save_onnx_model` is set to False")
 
         default_name = WEIGHTS_NAME if not isinstance(self._original_model, ORTModel) else ONNX_WEIGHTS_NAME
         self._set_task()
@@ -223,13 +229,16 @@
                 f"but only version {IPEX_MINIMUM_VERSION} or higher is supported."
             )
 
-        if save_onnx_model:
-            if (
-                not isinstance(quantization_config, PostTrainingQuantConfig)
-                or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
-            ):
-                logger.warning("ONNX export for dynamic and weight only quantized model is not supported.")
-                save_onnx_model = False
+        if save_onnx_model and (
+            not isinstance(quantization_config, PostTrainingQuantConfig)
+            or INCQuantizationMode(quantization_config.approach) == INCQuantizationMode.DYNAMIC
+        ):
+            logger.warning(
+                "ONNX export for dynamic and weight only quantized models is not supported. "
+                "Only statically quantized models can be exported to the ONNX format. "
+                "Setting `save_onnx_model` to False."
+            )
+            save_onnx_model = False
 
         # ITREX Weight Only Quantization
         if not isinstance(quantization_config, PostTrainingQuantConfig):
@@ -296,9 +305,13 @@
                 remove_unused_columns=remove_unused_columns,
                 data_collator=data_collator,
             )
+
             op_type_dict = getattr(quantization_config, "op_type_dict", None)
-            if op_type_dict is None or "Embedding" not in op_type_dict:
-                logger.warning("ONNX export is no supported for model with quantized embeddings")
+            if save_onnx_model and (op_type_dict is None or "Embedding" not in op_type_dict):
+                logger.warning(
+                    "ONNX export is not supported for models with quantized embeddings. "
+                    "Setting `save_onnx_model` to False."
+                )
                 save_onnx_model = False
 
         if not isinstance(quantization_config, PostTrainingQuantConfig):
diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py
index df43b43582..ea9fe9a1cd 100644
--- a/optimum/intel/neural_compressor/trainer.py
+++ b/optimum/intel/neural_compressor/trainer.py
@@ -175,6 +175,9 @@ def __init__(
         # TODO : To deprecate once support transformers > 4.30.0
         self.deepspeed = None
 
+        if save_onnx_model:
+            logger.warning("ONNX model saving is deprecated and will be removed soon.")
+
         # Attach dtype and architecture to the config
         if quantization_config is not None:
             self.dtype = "int8"
@@ -678,15 +681,12 @@ def _inner_training_loop(
     def save_model(
         self,
         output_dir: Optional[str] = None,
-        _internal_call: bool = False,
-        save_onnx_model: Optional[bool] = None,
+        save_onnx_model: bool = False,
     ):
         """
         Will save the model, so you can reload it using `from_pretrained()`.
 
         Will only save from the main process.
         """
-        save_onnx_model = save_onnx_model if save_onnx_model is not None else self.save_onnx_model
-
         if output_dir is None:
             output_dir = self.args.output_dir
@@ -734,7 +734,10 @@ def _save(
 
         # Disable ONNX export for quantized model as deprecated in neural-compressor>=2.2.0
         if save_onnx_model and self.dtype == "int8":
-            logger.warning("ONNX export for quantized model is no longer supported by neural-compressor>=2.2.0. ")
+            logger.warning(
+                "ONNX export for quantized model is no longer supported by neural-compressor>=2.2.0. "
+                "Setting `save_onnx_model` to False."
+            )
             save_onnx_model = False
 
         # Export the compressed model to the ONNX format