diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml
index 7b87a20d0c..9f4e532244 100644
--- a/.github/workflows/test_inc.yml
+++ b/.github/workflows/test_inc.yml
@@ -37,7 +37,6 @@ jobs:
         pip install py-cpuinfo
         pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
         pip install .[neural-compressor,diffusers,tests]
-        pip install intel-extension-for-transformers
         pip install peft
 
     - name: Test with Pytest
@@ -45,7 +44,6 @@ jobs:
         pytest tests/neural_compressor/ --ignore tests/neural_compressor/test_ipex.py --durations=0
 
     - name: Test IPEX
       run: |
-        pip uninstall -y intel-extension-for-transformers
         pip install intel-extension-for-pytorch==2.3.0
         pytest tests/neural_compressor/test_ipex.py
diff --git a/examples/neural_compressor/language-modeling/README.md b/examples/neural_compressor/language-modeling/README.md
index 80d7a25d16..476ac526e1 100644
--- a/examples/neural_compressor/language-modeling/README.md
+++ b/examples/neural_compressor/language-modeling/README.md
@@ -97,4 +97,4 @@ respectively `dynamic`, `static`, `weight_only` or `aware_training`.
 
 The flag `--verify_loading` can be passed along to verify that the resulting quantized model can be loaded correctly.
 
-> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` >= 2.3 and `intel-extension-for-transformers` >= 1.3.
+> **_Note:_** `weight_only` quantization_approach requires `neural-compressor` > 3.0.
diff --git a/examples/neural_compressor/language-modeling/requirements.txt b/examples/neural_compressor/language-modeling/requirements.txt
index ec38e83d2d..8960d82dbc 100644
--- a/examples/neural_compressor/language-modeling/requirements.txt
+++ b/examples/neural_compressor/language-modeling/requirements.txt
@@ -3,5 +3,4 @@ torch >= 1.9
 datasets >= 1.8.0
 sentencepiece != 0.1.92
 protobuf
-intel-extension-for-transformers >= 1.3
 peft
diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py
index 764526f6ed..f3948872e9 100644
--- a/optimum/intel/neural_compressor/modeling_base.py
+++ b/optimum/intel/neural_compressor/modeling_base.py
@@ -139,6 +139,10 @@ def _from_pretrained(
                 _BaseINCAutoModelClass,
             )
 
+            warnings.warn(
+                "Weight only quantization model loading provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
+                DeprecationWarning,
+            )
             _BaseINCAutoModelClass.ORIG_MODEL = cls.auto_model_class
             model = _BaseINCAutoModelClass.load_low_bit(
                 model_id,
@@ -157,6 +161,10 @@ def _from_pretrained(
             except Exception as e:
                 raise RuntimeError(f"The quantized model cannot be loaded. Detailed error: {e}")
         if isinstance(quantization_config, (RtnConfig, GPTQConfig)):
+            warnings.warn(
+                "Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
+                DeprecationWarning,
+            )
             model = weight_only_quantization(
                 cls.auto_model_class,
                 model_id,
diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py
index 5d53ca6a52..464f6cb325 100644
--- a/optimum/intel/neural_compressor/quantization.py
+++ b/optimum/intel/neural_compressor/quantization.py
@@ -354,10 +354,6 @@ def weight_only_quantization(
     device_map = kwargs.get("device_map", "xpu" if (hasattr(torch, "xpu") and torch.xpu.is_available()) else "cpu")
     use_xpu = True if device_map == torch.device("xpu") or device_map == "xpu" else False
 
-    warnings.warn(
-        "Weight only quantization provided by intel_extension_for_transformers is deprecated and it is provided by INC now.",
-        DeprecationWarning,
-    )
     if is_neural_compressor_version("<=", "3.0"):
         raise AssertionError("Please use neural_compressor version > 3.0.")
     if is_ipex_version("<", "2.3.1") and use_xpu: