From 3d6975b569a907c146ef6b845a5483a046b8d7f1 Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Wed, 17 Jan 2024 19:17:12 -0800
Subject: [PATCH] Remove UT of onnxrt woq tune for large model to save CI time
 (#1548)

Signed-off-by: yuwenzho
---
 .../test_weight_only_adaptor.py | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py b/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
index 361f4aae75b..7ed5cb01171 100644
--- a/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
+++ b/test/adaptor/onnxrt_adaptor/test_weight_only_adaptor.py
@@ -59,23 +59,11 @@ def setUpClass(self):
         self.gptj_fp16_model = onnx.load("gptj_fp16/decoder_model.onnx")
         self.gptj_dataloader = DummyNLPDataloader("hf-internal-testing/tiny-random-gptj")
 
-        cmd = (
-            "optimum-cli export onnx --model PY007/TinyLlama-1.1B-Chat-v0.3 --task text-generation --legacy tiny-llama/"
-        )
-        p = subprocess.Popen(
-            cmd, preexec_fn=os.setsid, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
-        ) # nosec
-        p.communicate()
-
-        self.llama_model = "tiny-llama/decoder_model.onnx"
-        self.llama_dataloader = DummyNLPDataloader("PY007/TinyLlama-1.1B-Chat-v0.3")
-
     @classmethod
     def tearDownClass(self):
         shutil.rmtree("nc_workspace", ignore_errors=True)
         shutil.rmtree("gptj", ignore_errors=True)
         shutil.rmtree("gptj_fp16", ignore_errors=True)
-        shutil.rmtree("tiny-llama", ignore_errors=True)
 
     @unittest.skipIf("CUDAExecutionProvider" not in ort.get_available_providers(), "Skip cuda woq test")
     def test_RTN_quant_with_woq_op(self):
@@ -478,18 +466,6 @@ def fake_eval(model, eval_result_lst):
         )
         self.assertEqual(self._count_woq_matmul(woq_model, bits=8), 31)
 
-    def test_woq_tune_with_large_model(self):
-        from functools import partial
-
-        def fake_eval(model, eval_result_lst):
-            acc = eval_result_lst.pop(0)
-            return acc
-
-        # Expect tuning ends with WOQ algorithm 'RTN_G32ASYM'
-        partial_fake_eval = partial(fake_eval, eval_result_lst=[1, 1.1])
-        woq_model = self._test_woq_tune_common(self.llama_model, self.llama_dataloader, partial_fake_eval)
-        self.assertEqual(self._count_woq_matmul(woq_model), 155)
-
     def test_woq_with_ModelProto_input(self):
         from neural_compressor.model.onnx_model import ONNXModel