diff --git a/inference/huggingface/text-generation/arguments.py b/inference/huggingface/text-generation/arguments.py index 87d2def0e..b50198ff9 100644 --- a/inference/huggingface/text-generation/arguments.py +++ b/inference/huggingface/text-generation/arguments.py @@ -17,4 +17,5 @@ parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughout testing") parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank") parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world_size") -parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing") \ No newline at end of file +parser.add_argument("--test_hybrid_engine", action='store_true', help="enable hybrid engine testing") +parser.add_argument("--trust_remote_code", action='store_true', help="Trust remote code for hugging face models") \ No newline at end of file diff --git a/inference/huggingface/text-generation/inference-test.py b/inference/huggingface/text-generation/inference-test.py index f8e1dc548..827d8db35 100644 --- a/inference/huggingface/text-generation/inference-test.py +++ b/inference/huggingface/text-generation/inference-test.py @@ -24,7 +24,8 @@ dtype=data_type, is_meta=args.use_meta_tensor, device=args.local_rank, - checkpoint_path=args.checkpoint_path) + checkpoint_path=args.checkpoint_path, + trust_remote_code=args.trust_remote_code) if args.local_rank == 0: print(f"initialization time: {(time.time()-t0) * 1000}ms") @@ -51,7 +52,7 @@ save_mp_checkpoint_path=args.save_mp_checkpoint_path, **ds_kwargs ) - + if args.local_rank == 0: see_memory_usage("after init_inference", True) @@ -90,4 +91,3 @@ print(f"\nin={i}\nout={o}\n{'-'*60}") if args.test_performance: Performance.print_perf_stats(map(lambda t: t / args.max_new_tokens, times), pipe.model.config, args.dtype, args.batch_size) - diff --git a/inference/huggingface/text-generation/utils.py b/inference/huggingface/text-generation/utils.py index 096b2f40d..173eac039 100644 --- a/inference/huggingface/text-generation/utils.py +++ b/inference/huggingface/text-generation/utils.py @@ -21,7 +21,8 @@ def __init__(self, dtype=torch.float16, is_meta=True, device=-1, - checkpoint_path=None + checkpoint_path=None, + trust_remote_code=False, ): self.model_name = model_name self.dtype = dtype @@ -38,18 +39,18 @@ def __init__(self, # the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time. self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"] - self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", trust_remote_code=trust_remote_code) self.tokenizer.pad_token = self.tokenizer.eos_token if (is_meta): '''When meta tensors enabled, use checkpoints''' - self.config = AutoConfig.from_pretrained(self.model_name) + self.config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=trust_remote_code) self.repo_root, self.checkpoints_json = self._generate_json(checkpoint_path) with deepspeed.OnDevice(dtype=torch.float16, device="meta"): - self.model = AutoModelForCausalLM.from_config(self.config) + self.model = AutoModelForCausalLM.from_config(self.config, trust_remote_code=trust_remote_code) else: - self.model = AutoModelForCausalLM.from_pretrained(self.model_name) + self.model = AutoModelForCausalLM.from_pretrained(self.model_name, trust_remote_code=trust_remote_code) self.model.eval()