From 36075e611b3b3cbbdcadad57ff73a9f987b00f3e Mon Sep 17 00:00:00 2001
From: Mark Mozolewski <mark.mozolewski@gmail.com>
Date: Fri, 16 Feb 2024 17:48:43 +0000
Subject: [PATCH 1/2] Add code-revision argument for Hugging Face Hub

Introduce the `code-revision` argument to facilitate the selection of a specific revision when fetching model code from the Hugging Face Hub. Prior to this enhancement, the default behavior was to always download the latest version which may not always be intended. Users can now specify a revision to ensure consistent model code retrieval. The default behavior remains unchanged, automatically fetching the latest version when the argument is not provided.

Run yapf and ruff
---
 vllm/config.py                    |  8 +++++++-
 vllm/engine/arg_utils.py          | 14 +++++++++++---
 vllm/transformers_utils/config.py | 12 +++++++++---
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 27c61d4d50439..0b8a2a27f6d43 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -44,6 +44,9 @@ class ModelConfig:
         revision: The specific model version to use. It can be a branch name,
             a tag name, or a commit id. If unspecified, will use the default
             version.
+        code_revision: The specific revision to use for the model code on
+            Hugging Face Hub. It can be a branch name, a tag name, or a 
+            commit id. If unspecified, will use the default version.
         tokenizer_revision: The specific tokenizer version to use. It can be a
             branch name, a tag name, or a commit id. If unspecified, will use
             the default version.
@@ -70,6 +73,7 @@ def __init__(
         dtype: Union[str, torch.dtype],
         seed: int,
         revision: Optional[str] = None,
+        code_revision: Optional[str] = None,
         tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
@@ -84,6 +88,7 @@ def __init__(
         self.load_format = load_format
         self.seed = seed
         self.revision = revision
+        self.code_revision = code_revision
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.enforce_eager = enforce_eager
@@ -103,7 +108,8 @@ def __init__(
             self.download_dir = model_path
             self.tokenizer = model_path
 
-        self.hf_config = get_config(self.model, trust_remote_code, revision)
+        self.hf_config = get_config(self.model, trust_remote_code, revision,
+                                    code_revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
         self.max_model_len = _get_and_verify_max_len(self.hf_config,
                                                      max_model_len)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index d5e63e25d6e85..97dc271b43c7d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -32,6 +32,7 @@ class EngineArgs:
     max_paddings: int = 256
     disable_log_stats: bool = False
     revision: Optional[str] = None
+    code_revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
     enforce_eager: bool = False
@@ -75,6 +76,13 @@ def add_cli_args(
             help='the specific model version to use. It can be a branch '
             'name, a tag name, or a commit id. If unspecified, will use '
             'the default version.')
+        parser.add_argument(
+            '--code-revision',
+            type=str,
+            default=None,
+            help='the specific revision to use for the model code on '
+            'Hugging Face Hub. It can be a branch name, a tag name, or a '
+            'commit id. If unspecified, will use the default version.')        
         parser.add_argument(
             '--tokenizer-revision',
             type=str,
@@ -282,9 +290,9 @@ def create_engine_configs(
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.load_format,
-                                   self.dtype, self.seed, self.revision,
-                                   self.tokenizer_revision, self.max_model_len,
-                                   self.quantization, self.enforce_eager,
+                                   self.dtype, self.seed, self.revision, self.code_revision, 
+                                   self.tokenizer_revision, self.max_model_len, 
+                                   self.quantization, self.enforce_eager, 
                                    self.max_context_len_to_capture)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index b12918e41b32e..491cb4d9a427c 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -16,10 +16,14 @@
 
 def get_config(model: str,
                trust_remote_code: bool,
-               revision: Optional[str] = None) -> PretrainedConfig:
+               revision: Optional[str] = None,
+               code_revision: Optional[str] = None) -> PretrainedConfig:
     try:
         config = AutoConfig.from_pretrained(
-            model, trust_remote_code=trust_remote_code, revision=revision)
+            model,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            code_revision=code_revision)
     except ValueError as e:
         if (not trust_remote_code and
                 "requires you to execute the configuration file" in str(e)):
@@ -33,5 +37,7 @@ def get_config(model: str,
             raise e
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
-        config = config_class.from_pretrained(model, revision=revision)
+        config = config_class.from_pretrained(model,
+                                              revision=revision,
+                                              code_revision=code_revision)
     return config

From c402ed9372b405ded987c0d90a96ec60cccd4556 Mon Sep 17 00:00:00 2001
From: Mark Mozolewski <mark.mozolewski@gmail.com>
Date: Fri, 16 Feb 2024 17:59:06 +0000
Subject: [PATCH 2/2] Run yapf and ruff

---
 vllm/engine/arg_utils.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 97dc271b43c7d..8ac0157151d8e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -82,7 +82,7 @@ def add_cli_args(
             default=None,
             help='the specific revision to use for the model code on '
             'Hugging Face Hub. It can be a branch name, a tag name, or a '
-            'commit id. If unspecified, will use the default version.')        
+            'commit id. If unspecified, will use the default version.')
         parser.add_argument(
             '--tokenizer-revision',
             type=str,
@@ -287,13 +287,12 @@ def create_engine_configs(
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig,
                DeviceConfig, Optional[LoRAConfig]]:
         device_config = DeviceConfig(self.device)
-        model_config = ModelConfig(self.model, self.tokenizer,
-                                   self.tokenizer_mode, self.trust_remote_code,
-                                   self.download_dir, self.load_format,
-                                   self.dtype, self.seed, self.revision, self.code_revision, 
-                                   self.tokenizer_revision, self.max_model_len, 
-                                   self.quantization, self.enforce_eager, 
-                                   self.max_context_len_to_capture)
+        model_config = ModelConfig(
+            self.model, self.tokenizer, self.tokenizer_mode,
+            self.trust_remote_code, self.download_dir, self.load_format,
+            self.dtype, self.seed, self.revision, self.code_revision,
+            self.tokenizer_revision, self.max_model_len, self.quantization,
+            self.enforce_eager, self.max_context_len_to_capture)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,