From ca41e8b06beb65ba045efa2aa44a24bfbe169b24 Mon Sep 17 00:00:00 2001
From: WRH <12756472+wangruohui@users.noreply.github.com>
Date: Wed, 4 Oct 2023 05:38:38 +0800
Subject: [PATCH] support `trust_remote_code` in inference test (#709)

* support trust_remote_code

* make trust_remote_code an argument

---------

Co-authored-by: Ammar Ahmad Awan
Co-authored-by: Michael Wyatt
---
 inference/huggingface/text-generation/arguments.py |  3 ++-
 .../huggingface/text-generation/inference-test.py  |  6 +++---
 inference/huggingface/text-generation/utils.py     | 11 ++++++-----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/inference/huggingface/text-generation/arguments.py b/inference/huggingface/text-generation/arguments.py
index 87d2def0e..b50198ff9 100644
--- a/inference/huggingface/text-generation/arguments.py
+++ b/inference/huggingface/text-generation/arguments.py
@@ -17,4 +17,5 @@
 parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughout testing")
 parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
 parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world_size")
-parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing")
\ No newline at end of file
+parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing")
+parser.add_argument("--trust_remote_code", action='store_true', help="Trust remote code for hugging face models")
\ No newline at end of file
diff --git a/inference/huggingface/text-generation/inference-test.py b/inference/huggingface/text-generation/inference-test.py
index f8e1dc548..827d8db35 100644
--- a/inference/huggingface/text-generation/inference-test.py
+++ b/inference/huggingface/text-generation/inference-test.py
@@ -24,7 +24,8 @@
                   dtype=data_type,
                   is_meta=args.use_meta_tensor,
                   device=args.local_rank,
-                  checkpoint_path=args.checkpoint_path)
+                  checkpoint_path=args.checkpoint_path,
+                  trust_remote_code=args.trust_remote_code)

 if args.local_rank == 0:
     print(f"initialization time: {(time.time()-t0) * 1000}ms")
@@ -51,7 +52,7 @@
                                  save_mp_checkpoint_path=args.save_mp_checkpoint_path,
                                  **ds_kwargs
                                  )
-
+
 if args.local_rank == 0:
     see_memory_usage("after init_inference", True)

@@ -90,4 +91,3 @@
         print(f"\nin={i}\nout={o}\n{'-'*60}")
 if args.test_performance:
     Performance.print_perf_stats(map(lambda t: t / args.max_new_tokens, times), pipe.model.config, args.dtype, args.batch_size)
-
diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py
index 096b2f40d..173eac039 100644
--- a/inference/huggingface/text-generation/utils.py
+++ b/inference/huggingface/text-generation/utils.py
@@ -21,7 +21,8 @@ def __init__(self,
                  dtype=torch.float16,
                  is_meta=True,
                  device=-1,
-                 checkpoint_path=None
+                 checkpoint_path=None,
+                 trust_remote_code=False,
                  ):
         self.model_name = model_name
         self.dtype = dtype
@@ -38,18 +39,18 @@ def __init__(self,
         # the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time.
         self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"]

-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=trust_remote_code)
         self.tokenizer.pad_token = self.tokenizer.eos_token

         if (is_meta):
             '''When meta tensors enabled, use checkpoints'''
-            self.config = AutoConfig.from_pretrained(self.model_name)
+            self.config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=trust_remote_code)
             self.repo_root, self.checkpoints_json = self._generate_json(checkpoint_path)

             with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-                self.model = AutoModelForCausalLM.from_config(self.config)
+                self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=trust_remote_code)
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=trust_remote_code)

         self.model.eval()
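
Usage note (illustrative, not part of the patch): with this change, models whose implementations ship as custom code in their Hugging Face repos can be loaded by passing the new flag, which is forwarded to the AutoTokenizer, AutoConfig, and AutoModelForCausalLM calls shown above. A minimal sketch, assuming the pipeline class defined in utils.py is named DSPipeline (as in the surrounding repo) and using a hypothetical model id that requires remote code:

    import torch
    from utils import DSPipeline  # pipeline class defined in text-generation/utils.py

    # Roughly what `deepspeed inference-test.py --model <id> --trust_remote_code`
    # constructs before wrapping the model with deepspeed.init_inference().
    pipe = DSPipeline(model_name="org/model-with-custom-code",  # hypothetical model id
                      dtype=torch.float16,
                      is_meta=False,           # the meta-tensor path also needs checkpoint_path
                      device=0,
                      trust_remote_code=True)  # new flag, forwarded to the HF loading calls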