From 295535ac8b0f957deda236f4b06e5565b43974fd Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Tue, 5 Sep 2023 14:39:25 +0800
Subject: [PATCH] enhance onnxrt backend setting

Signed-off-by: yuwenzho
---
 neural_compressor/adaptor/onnxrt.py                | 8 ++++++--
 neural_compressor/model/model.py                   | 4 ++--
 test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py | 9 +++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 457b9818027..03ca50b9364 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -83,6 +83,10 @@ def __init__(self, framework_specific_info):
             self.format = "integerops"
             if "format" in framework_specific_info and framework_specific_info["format"].lower() == "qdq":
                 logger.warning("Dynamic approach doesn't support QDQ format.")
+
+        # do not load TensorRT if backend is not TensorrtExecutionProvider
+        if self.backend != "TensorrtExecutionProvider":
+            os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1"
 
         # get quantization config file according to backend
         config_file = None
@@ -700,9 +704,9 @@ def _detect_domain(self, model):
         # typically, NLP models have multiple inputs,
         # and the dimension of each input is usually 2 (batch_size, max_seq_len)
         if not model.is_large_model:
-            sess = ort.InferenceSession(model.model.SerializeToString(), providers=ort.get_available_providers())
+            sess = ort.InferenceSession(model.model.SerializeToString(), providers=["CPUExecutionProvider"])
         elif model.model_path is not None:  # pragma: no cover
-            sess = ort.InferenceSession(model.model_path, providers=ort.get_available_providers())
+            sess = ort.InferenceSession(model.model_path, providers=["CPUExecutionProvider"])
         else:  # pragma: no cover
             assert False, "Please use model path instead of onnx model object to quantize."
         input_shape_lens = [len(input.shape) for input in sess.get_inputs()]
diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py
index 032f930d4b9..5480d29906b 100644
--- a/neural_compressor/model/model.py
+++ b/neural_compressor/model/model.py
@@ -83,9 +83,9 @@ def _is_onnxruntime(model):
             so.register_custom_ops_library(get_library_path())
 
         if isinstance(model, str):
-            ort.InferenceSession(model, so, providers=ort.get_available_providers())
+            ort.InferenceSession(model, so, providers=["CPUExecutionProvider"])
         else:
-            ort.InferenceSession(model.SerializeToString(), so, providers=ort.get_available_providers())
+            ort.InferenceSession(model.SerializeToString(), so, providers=["CPUExecutionProvider"])
     except Exception as e:  # pragma: no cover
         if "Message onnx.ModelProto exceeds maximum protobuf size of 2GB" in str(e):
             logger.warning("Please use model path instead of onnx model object to quantize")
diff --git a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
index 121ab613ea7..f149b758b01 100644
--- a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
+++ b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
@@ -1657,6 +1657,15 @@ def test_backend(self, mock_warning):
 
         self.assertEqual(mock_warning.call_count, 2)
 
+    def test_cuda_ep_env_set(self):
+        config = PostTrainingQuantConfig(approach="static", backend="onnxrt_cuda_ep", device="gpu", quant_level=1)
+        q_model = quantization.fit(
+            self.distilbert_model,
+            config,
+            calib_dataloader=DummyNLPDataloader_dict("distilbert-base-uncased-finetuned-sst-2-english")
+        )
+        self.assertEqual(os.environ.get("ORT_TENSORRT_UNAVAILABLE"), "1")
+
 
 if __name__ == "__main__":
     unittest.main()
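Note (illustrative only, not part of the patch): the sketch below summarizes the behavior the diff introduces, namely gating TensorRT loading via the ORT_TENSORRT_UNAVAILABLE environment variable and using only CPUExecutionProvider for lightweight model-probing sessions. The helper name create_probe_session is a hypothetical example, not a neural_compressor API.

    import os
    import onnxruntime as ort

    def create_probe_session(model_path, backend):
        # Hypothetical helper (assumption, not an INC API) mirroring the patch's intent.
        # Avoid loading TensorRT libraries unless the TensorRT EP was explicitly requested.
        if backend != "TensorrtExecutionProvider":
            os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1"
        # Probing sessions (e.g. domain detection) only need the CPU EP, so pass
        # ["CPUExecutionProvider"] instead of ort.get_available_providers(), which
        # could pull in GPU/TensorRT providers.
        return ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])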