From 295535ac8b0f957deda236f4b06e5565b43974fd Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Tue, 5 Sep 2023 14:39:25 +0800
Subject: [PATCH] enhance onnxrt backend setting

Signed-off-by: yuwenzho
---
 neural_compressor/adaptor/onnxrt.py                | 8 ++++++--
 neural_compressor/model/model.py                   | 4 ++--
 test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py | 9 +++++++++
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py
index 457b9818027..03ca50b9364 100644
--- a/neural_compressor/adaptor/onnxrt.py
+++ b/neural_compressor/adaptor/onnxrt.py
@@ -83,6 +83,10 @@ def __init__(self, framework_specific_info):
             self.format = "integerops"
             if "format" in framework_specific_info and framework_specific_info["format"].lower() == "qdq":
                 logger.warning("Dynamic approach doesn't support QDQ format.")
+
+        # do not load TensorRT if backend is not TensorrtExecutionProvider
+        if self.backend != "TensorrtExecutionProvider":
+            os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1"
 
         # get quantization config file according to backend
         config_file = None
@@ -700,9 +704,9 @@ def _detect_domain(self, model):
         # typically, NLP models have multiple inputs,
         # and the dimension of each input is usually 2 (batch_size, max_seq_len)
         if not model.is_large_model:
-            sess = ort.InferenceSession(model.model.SerializeToString(), providers=ort.get_available_providers())
+            sess = ort.InferenceSession(model.model.SerializeToString(), providers=["CPUExecutionProvider"])
         elif model.model_path is not None:  # pragma: no cover
-            sess = ort.InferenceSession(model.model_path, providers=ort.get_available_providers())
+            sess = ort.InferenceSession(model.model_path, providers=["CPUExecutionProvider"])
         else:  # pragma: no cover
             assert False, "Please use model path instead of onnx model object to quantize."
         input_shape_lens = [len(input.shape) for input in sess.get_inputs()]
diff --git a/neural_compressor/model/model.py b/neural_compressor/model/model.py
index 032f930d4b9..5480d29906b 100644
--- a/neural_compressor/model/model.py
+++ b/neural_compressor/model/model.py
@@ -83,9 +83,9 @@ def _is_onnxruntime(model):
             so.register_custom_ops_library(get_library_path())
 
         if isinstance(model, str):
-            ort.InferenceSession(model, so, providers=ort.get_available_providers())
+            ort.InferenceSession(model, so, providers=["CPUExecutionProvider"])
         else:
-            ort.InferenceSession(model.SerializeToString(), so, providers=ort.get_available_providers())
+            ort.InferenceSession(model.SerializeToString(), so, providers=["CPUExecutionProvider"])
     except Exception as e:  # pragma: no cover
         if "Message onnx.ModelProto exceeds maximum protobuf size of 2GB" in str(e):
             logger.warning("Please use model path instead of onnx model object to quantize")
diff --git a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
index 121ab613ea7..f149b758b01 100644
--- a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
+++ b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py
@@ -1657,6 +1657,15 @@ def test_backend(self, mock_warning):
 
         self.assertEqual(mock_warning.call_count, 2)
 
+    def test_cuda_ep_env_set(self):
+        config = PostTrainingQuantConfig(approach="static", backend="onnxrt_cuda_ep", device="gpu", quant_level=1)
+        q_model = quantization.fit(
+            self.distilbert_model,
+            config,
+            calib_dataloader=DummyNLPDataloader_dict("distilbert-base-uncased-finetuned-sst-2-english")
+        )
+        self.assertEqual(os.environ.get("ORT_TENSORRT_UNAVAILABLE"), "1")
+
 
 if __name__ == "__main__":
     unittest.main()
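Note (illustrative only, not part of the patch): the sketch below summarizes the behavior the diff introduces, namely gating TensorRT loading via the ORT_TENSORRT_UNAVAILABLE environment variable and using only CPUExecutionProvider for lightweight model-probing sessions. The helper name create_probe_session is a hypothetical example, not a neural_compressor API.

    import os
    import onnxruntime as ort

    def create_probe_session(model_path, backend):
        # Hypothetical helper (assumption, not an INC API) mirroring the patch's intent.
        # Avoid loading TensorRT libraries unless the TensorRT EP was explicitly requested.
        if backend != "TensorrtExecutionProvider":
            os.environ["ORT_TENSORRT_UNAVAILABLE"] = "1"
        # Probing sessions (e.g. domain detection) only need the CPU EP, so pass
        # ["CPUExecutionProvider"] instead of ort.get_available_providers(), which
        # could pull in GPU/TensorRT providers.
        return ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])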