From 99f797d2b2d3597411084645a79a094adf84173c Mon Sep 17 00:00:00 2001
From: changwangss
Date: Fri, 6 Sep 2024 03:25:24 -0700
Subject: [PATCH] improve ut

Signed-off-by: changwangss
---
 .github/workflows/test_inc.yml               |  1 +
 tests/neural_compressor/test_modeling.py     | 14 ++++++-----
 tests/neural_compressor/test_optimization.py | 26 ++++++++++++++------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml
index 9f4e532244..51f7a39cff 100644
--- a/.github/workflows/test_inc.yml
+++ b/.github/workflows/test_inc.yml
@@ -37,6 +37,7 @@ jobs:
         pip install py-cpuinfo
         pip install torch==2.3.0 torchaudio==2.3.0 torchvision==0.18 --index-url https://download.pytorch.org/whl/cpu
         pip install .[neural-compressor,diffusers,tests]
+        pip install intel_extension_for_pytorch==2.3.0
         pip install peft
 
     - name: Test with Pytest
diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py
index f8eb000564..183ee27ced 100644
--- a/tests/neural_compressor/test_modeling.py
+++ b/tests/neural_compressor/test_modeling.py
@@ -147,23 +147,25 @@ def test_compare_with_and_without_past_key_values(self):
 
         self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv))
 
     def test_saving_loading_inc_woq_model(self):
-        model_name = "TheBlokeAI/Mixtral-tiny-GPTQ"
-        subfolder = "inc"
-        model = INCModelForCausalLM.from_pretrained(model_name, revision="inc", subfolder=subfolder)
-        tokenizer = AutoTokenizer.from_pretrained(model_name, revision="inc")
+        model_name = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
+        subfolder = "inc_woq"
+        model = INCModelForCausalLM.from_pretrained(model_name, revision="main")
+        tokenizer = AutoTokenizer.from_pretrained(model_name, revision="main")
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
         tokens = tokenizer("This is a sample output", return_tensors="pt")
+        with torch.no_grad():
+            outputs = model(**tokens)
+
         with tempfile.TemporaryDirectory() as tmp_dir:
             model_save_dir = Path(tmp_dir) / subfolder
             model.save_pretrained(model_save_dir)
             folder_contents = os.listdir(model_save_dir)
             self.assertIn(SAFE_WEIGHTS_NAME, folder_contents)
             self.assertIn(QUANTIZATION_CONFIG_NAME, folder_contents)
-            loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder)
+            loaded_model = INCModelForCausalLM.from_pretrained(model_save_dir)
 
             with torch.no_grad():
-                outputs = model(**tokens)
                 loaded_outputs = loaded_model(**tokens)
 
         self.assertTrue("logits" in loaded_outputs)
diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py
index 65e970c439..8a5ebc99a4 100644
--- a/tests/neural_compressor/test_optimization.py
+++ b/tests/neural_compressor/test_optimization.py
@@ -467,22 +467,32 @@ def _compute_metrics(pred):
 
 class WeightOnlyQuantizationTest(INCTestMixin):
     WEIGHT_ONLY_CONFIG = (
-        ("rtn", "int4_clip"),
-        ("gptq", "int4_clip"),
+        ("rtn", 4),
+        ("gptq", 4),
     )
 
     @parameterized.expand(WEIGHT_ONLY_CONFIG)
-    def test_weight_only_quantization(self, methodology, weight_dtype):
+    def test_weight_only_quantization(self, methodology, bits):
         model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
 
         from neural_compressor.transformers import GPTQConfig, RtnConfig
 
-        bits = 4
         if methodology == "gptq":
-            # max_input_length can be removed after neural-compressor > v2.5.1
-            quantization_config = GPTQConfig(bits=bits, sym=True, damp_percent=0.01)
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            quantization_config = GPTQConfig(
+                bits=bits,
+                sym=True,
+                damp_percent=0.01,
+                desc_act=True,
+                tokenizer=tokenizer,
+                n_samples=20,
+                group_size=8,
+                batch_size=5,
+                seq_len=32,
+                block_size=16,
+            )
         else:
-            quantization_config = RtnConfig(bits=bits)
+            quantization_config = RtnConfig(bits=bits, group_size=8)
 
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         tokenizer.add_special_tokens({"pad_token": "[PAD]"})
@@ -494,7 +504,7 @@ def test_weight_only_quantization(self, methodology, weight_dtype):
         with torch.no_grad():
             quantizer_outputs = quantized_model(**tokens)
 
-        quantized_model.saved_pretrained(tmp_dir)
+        quantized_model.save_pretrained(tmp_dir)
         loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir)
         with torch.no_grad():
             loaded_outputs = loaded_model(**tokens)
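
Note (editor addition, not part of the patch): a minimal sketch of the weight-only quantization round trip these tests exercise. It assumes INCModelForCausalLM.from_pretrained accepts a quantization_config built from neural_compressor.transformers, which is how the test's quantized_model is presumably produced; the bits/group_size values mirror the test parameters, everything else is illustrative.

import tempfile

import torch
from transformers import AutoTokenizer
from neural_compressor.transformers import RtnConfig
from optimum.intel import INCModelForCausalLM

model_name = "hf-internal-testing/tiny-random-GPTNeoForCausalLM"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# RTN (round-to-nearest) is data-free weight-only quantization;
# bits=4 and group_size=8 mirror the updated WEIGHT_ONLY_CONFIG.
quantization_config = RtnConfig(bits=4, group_size=8)
# Assumption: the config is applied at load time, as the test helper implies.
quantized_model = INCModelForCausalLM.from_pretrained(
    model_name, quantization_config=quantization_config
)

tokens = tokenizer("This is a sample output", return_tensors="pt")
with torch.no_grad():
    outputs = quantized_model(**tokens)

# Save/load round trip, mirroring test_saving_loading_inc_woq_model:
# logits should survive a save_pretrained/from_pretrained cycle.
with tempfile.TemporaryDirectory() as tmp_dir:
    quantized_model.save_pretrained(tmp_dir)
    loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir)
    with torch.no_grad():
        loaded_outputs = loaded_model(**tokens)
    assert torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-4)

This is the same property the patch tightens in the tests: computing reference outputs before saving (under torch.no_grad()) and comparing them against the reloaded model's outputs.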