support trust_remote_code in inference test #709

Merged: 5 commits, Oct 3, 2023
3 changes: 2 additions & 1 deletion inference/huggingface/text-generation/arguments.py
@@ -17,4 +17,5 @@
 parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughput testing")
 parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
 parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world_size")
-parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing")
+parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing")
+parser.add_argument("--trust_remote_code", action='store_true', help="Trust remote code for Hugging Face models")
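As a usage note (not part of the diff): with the new flag in place, a run against a model that ships custom modeling code might look like the sketch below. The --name flag and the launcher invocation are assumptions about the surrounding scripts, and the model shown is only an example of one that has historically required remote code.

# Hedged example invocation; --name and the model are assumptions, not from this PR
deepspeed --num_gpus 1 inference-test.py --name tiiuae/falcon-7b --trust_remote_code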
6 changes: 3 additions & 3 deletions inference/huggingface/text-generation/inference-test.py
@@ -24,7 +24,8 @@
                   dtype=data_type,
                   is_meta=args.use_meta_tensor,
                   device=args.local_rank,
-                  checkpoint_path=args.checkpoint_path)
+                  checkpoint_path=args.checkpoint_path,
+                  trust_remote_code=args.trust_remote_code)

 if args.local_rank == 0:
     print(f"initialization time: {(time.time()-t0) * 1000}ms")
@@ -51,7 +52,7 @@
         save_mp_checkpoint_path=args.save_mp_checkpoint_path,
         **ds_kwargs
     )
-
+
 if args.local_rank == 0:
     see_memory_usage("after init_inference", True)
@@ -90,4 +91,3 @@
         print(f"\nin={i}\nout={o}\n{'-'*60}")
 if args.test_performance:
     Performance.print_perf_stats(map(lambda t: t / args.max_new_tokens, times), pipe.model.config, args.dtype, args.batch_size)
-
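Putting the two inference-test.py hunks together, the updated constructor call reads roughly as follows. This is a hedged reconstruction: the DSPipeline class name and args.name are assumptions about code outside this diff.

# Reconstructed call site (DSPipeline and args.name are assumed, see note above)
pipe = DSPipeline(model_name=args.name,
                  dtype=data_type,
                  is_meta=args.use_meta_tensor,
                  device=args.local_rank,
                  checkpoint_path=args.checkpoint_path,
                  trust_remote_code=args.trust_remote_code)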
11 changes: 6 additions & 5 deletions inference/huggingface/text-generation/utils.py
@@ -21,7 +21,8 @@ def __init__(self,
                  dtype=torch.float16,
                  is_meta=True,
                  device=-1,
-                 checkpoint_path=None
+                 checkpoint_path=None,
+                 trust_remote_code=False,
                  ):
         self.model_name = model_name
         self.dtype = dtype
@@ -38,18 +39,18 @@ def __init__(self,
         # the DeepSpeed team made these so they are super fast to load (~1 minute), rather than the usual 10-20 min loading time.
         self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"]

-        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=trust_remote_code)
         self.tokenizer.pad_token = self.tokenizer.eos_token

         if (is_meta):
             '''When meta tensors enabled, use checkpoints'''
-            self.config = AutoConfig.from_pretrained(self.model_name)
+            self.config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=trust_remote_code)
             self.repo_root, self.checkpoints_json = self._generate_json(checkpoint_path)

             with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-                self.model = AutoModelForCausalLM.from_config(self.config)
+                self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=trust_remote_code)
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=trust_remote_code)

         self.model.eval()

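For readers new to the flag, here is a minimal standalone sketch of the pattern utils.py now threads through: trust_remote_code tells the Hugging Face Auto* loaders to execute modeling code shipped inside the model repo itself, so it should only be enabled for sources you trust. The model name below is hypothetical; torch, transformers, and deepspeed are assumed installed.

import torch
import deepspeed
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

name = "some-org/model-with-custom-code"  # hypothetical Hub repo shipping its own modeling code
trust = True                              # what --trust_remote_code toggles

tokenizer = AutoTokenizer.from_pretrained(name, padding_side="left", trust_remote_code=trust)

# Meta-tensor path (mirrors the is_meta branch above): build the model from
# its config only, materializing real weights later from a checkpoint.
config = AutoConfig.from_pretrained(name, trust_remote_code=trust)
with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=trust)

# Eager path (the else branch above): download and load weights directly.
# model = AutoModelForCausalLM.from_pretrained(name, trust_remote_code=trust)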