From 98943cf698d00fa865e0f9e9b66d0d4d2a9074cd Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 09:35:01 +0000
Subject: [PATCH 01/17] Filter duplicate safetensors files using index file

---
 vllm/model_executor/model_loader/loader.py    | 13 ++++++---
 .../model_loader/weight_utils.py              | 28 +++++++++++++++++++
 2 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 45ea8160a801b..4036d37822f71 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1,6 +1,7 @@
 # ruff: noqa: SIM117
 import collections
 import copy
+import json
 import glob
 import os
 from abc import ABC, abstractmethod
@@ -24,13 +25,13 @@
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
     download_weights_from_hf, filter_files_not_needed_for_inference,
-    get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
+    filter_duplicate_safetensors_files, get_quant_config, 
+    initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 
 logger = init_logger(__name__)
 
-
 def _get_quantization_config(
         model_config: ModelConfig,
         load_config: LoadConfig) -> Optional[QuantizationConfig]:
@@ -186,9 +187,13 @@ def _prepare_weights(self, model_name_or_path: str,
             if len(hf_weights_files) > 0:
                 if pattern == "*.safetensors":
                     use_safetensors = True
+                
                 break
-
-        if not use_safetensors:
+        
+        if use_safetensors:
+            hf_weights_files = filter_duplicate_safetensors_files(
+                    hf_weights_files, hf_folder)
+        else:
             hf_weights_files = filter_files_not_needed_for_inference(
                 hf_weights_files)
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a1642baa2c90c..c1a0d7a56ab6f 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -24,6 +24,8 @@
 
 logger = init_logger(__name__)
 
+_SAFETENSORS_INDEX_FILE_NAME = "model.safetensors.index.json"
+
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
@@ -195,6 +197,8 @@ def download_weights_from_hf(
             if len(matching) > 0:
                 allow_patterns = [pattern]
                 break
+                
+        allow_patterns.append("*model.safetensors.index.json")
 
     logger.info("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
@@ -210,6 +214,30 @@ def download_weights_from_hf(
         )
     return hf_folder
 
+# For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+# there are both sharded safetensors files and a consolidated safetensors file.
+# Passing both of these to the weight loader functionality breaks.
+# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` to look up 
+# which safetensors files should be used.
+def filter_duplicate_safetensors_files(
+        hf_weights_files: List[str], hf_folder: str) -> List[str]:
+    # model.safetensors.index.json is a mapping from keys in the 
+    # torch state_dict to safetensors file holding that weight.
+    index_file = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME)
+    if not os.path.isfile(index_file):
+        return hf_weights_files
+    
+    # Iterate through the weight_map (weight_name: safetensors files)
+    # to identify weights that we should use.
+    weight_map = json.load(open(index_file))["weight_map"]
+    weight_files_in_index = set()
+    for weight_name in weight_map:
+        weight_files_in_index.add(os.path.join(hf_folder,
+                                               weight_map[weight_name]))
+    # Filter out any fields that are not found in the index file.
+    hf_weights_files = [f for f in 
+                        hf_weights_files if f in weight_files_in_index]
+    return hf_weights_files
 
 def filter_files_not_needed_for_inference(
         hf_weights_files: List[str]) -> List[str]:

From bc7e431787501b73f6230239dcc5c045d95be2fd Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:25:34 +0000
Subject: [PATCH 02/17] cleaned up for upstream

---
 vllm/model_executor/model_loader/loader.py    | 16 ++++++--
 .../model_loader/weight_utils.py              | 39 +++++++++++++++++--
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 4036d37822f71..11a08d060e4f0 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -24,8 +24,9 @@
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, filter_files_not_needed_for_inference,
-    filter_duplicate_safetensors_files, get_quant_config, 
+    download_weights_from_hf, download_safetensors_index_file_from_hf,
+    filter_files_not_needed_for_inference,
+    filter_duplicate_safetensors_files, get_quant_config,
     initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
@@ -164,6 +165,7 @@ def _prepare_weights(self, model_name_or_path: str,
         elif load_format == LoadFormat.SAFETENSORS:
             use_safetensors = True
             allow_patterns = ["*.safetensors"]
+
         elif load_format == LoadFormat.PT:
             allow_patterns = ["*.pt"]
         elif load_format == LoadFormat.NPCACHE:
@@ -187,12 +189,18 @@ def _prepare_weights(self, model_name_or_path: str,
             if len(hf_weights_files) > 0:
                 if pattern == "*.safetensors":
                     use_safetensors = True
-                
                 break
         
         if use_safetensors:
+            # For models like Mistral-7B-Instruct-v0.3
+            # there are both sharded safetensors files and a consolidated safetensors file.
+            # Passing both of these to the weight loader functionality breaks.
+            # Here, we download the `model.safetensors.index.json` and filter out 
+            # any safetensors files not found.
+            download_safetensors_index_file_from_hf(
+                model_name_or_path, self.load_config.download_dir, revision)
             hf_weights_files = filter_duplicate_safetensors_files(
-                    hf_weights_files, hf_folder)
+                hf_weights_files, hf_folder)
         else:
             hf_weights_files = filter_files_not_needed_for_inference(
                 hf_weights_files)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index c1a0d7a56ab6f..74782fa7bc4a0 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,7 +12,7 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
-from huggingface_hub import HfFileSystem, snapshot_download
+from huggingface_hub import HfFileSystem, snapshot_download, hf_hub_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 
@@ -198,8 +198,6 @@ def download_weights_from_hf(
                 allow_patterns = [pattern]
                 break
                 
-        allow_patterns.append("*model.safetensors.index.json")
-
     logger.info("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
@@ -212,8 +210,43 @@ def download_weights_from_hf(
             revision=revision,
             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
         )
+        
     return hf_folder
 
+
+def download_safetensors_index_file_from_hf(
+    model_name_or_path: str,
+    cache_dir: Optional[str],
+    allow_patterns: List[str],
+    revision: Optional[str] = None,
+) -> None:
+    """Download hf safetensors index file from Hugging Face Hub.
+
+    Args:
+        model_name_or_path (str): The model name or path.
+        cache_dir (Optional[str]): The cache directory to store the model
+            weights. If None, will use HF defaults.
+        revision (Optional[str]): The revision of the model.
+    """
+    # Use file lock to prevent multiple processes from
+    # downloading the same model weights at the same time.
+    with get_lock(model_name_or_path, cache_dir):
+        try:
+            # Download the safetensors index file.
+            _ = hf_hub_download(
+                repo_id=model_name_or_path,
+                filename=_SAFETENSORS_INDEX_FILE_NAME,
+                cache_dir=cache_dir,
+                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            )
+        # If file not found on remote or locally, we should not fail since
+        # only some models will have _SAFETENSORS_INDEX_FILE_NAME.
+        except huggingface_hub.utils.EntryNotFoundError:
+            logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in remote.")
+        except huggingface_hub.utils.LocalEntryNotFoundError:
+            logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in local cache.")
+
 # For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
 # there are both sharded safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.

From 9ebd6d469f1823f09a59ebb2cb7dcb281c2a9882 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:34:33 +0000
Subject: [PATCH 03/17] format

---
 vllm/model_executor/model_loader/loader.py    | 17 ++++---
 .../model_loader/weight_utils.py              | 44 ++++++++++---------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 11a08d060e4f0..18da9e58dcde7 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1,7 +1,6 @@
 # ruff: noqa: SIM117
 import collections
 import copy
-import json
 import glob
 import os
 from abc import ABC, abstractmethod
@@ -24,15 +23,15 @@
 from vllm.model_executor.model_loader.utils import (get_model_architecture,
                                                     set_default_torch_dtype)
 from vllm.model_executor.model_loader.weight_utils import (
-    download_weights_from_hf, download_safetensors_index_file_from_hf,
-    filter_files_not_needed_for_inference,
-    filter_duplicate_safetensors_files, get_quant_config,
-    initialize_dummy_weights, np_cache_weights_iterator,
+    download_safetensors_index_file_from_hf, download_weights_from_hf,
+    filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
+    get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 
 logger = init_logger(__name__)
 
+
 def _get_quantization_config(
         model_config: ModelConfig,
         load_config: LoadConfig) -> Optional[QuantizationConfig]:
@@ -190,12 +189,12 @@ def _prepare_weights(self, model_name_or_path: str,
                 if pattern == "*.safetensors":
                     use_safetensors = True
                 break
-        
+
         if use_safetensors:
             # For models like Mistral-7B-Instruct-v0.3
-            # there are both sharded safetensors files and a consolidated safetensors file.
-            # Passing both of these to the weight loader functionality breaks.
-            # Here, we download the `model.safetensors.index.json` and filter out 
+            # there are both sharded safetensors files and a consolidated
+            # safetensors file. Using both in the weight_loader breaks.
+            # Here, we download the `model.safetensors.index.json` and filter
             # any safetensors files not found.
             download_safetensors_index_file_from_hf(
                 model_name_or_path, self.load_config.download_dir, revision)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 74782fa7bc4a0..8bdf536305556 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,7 +12,7 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
-from huggingface_hub import HfFileSystem, snapshot_download, hf_hub_download
+from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 
@@ -197,7 +197,7 @@ def download_weights_from_hf(
             if len(matching) > 0:
                 allow_patterns = [pattern]
                 break
-                
+
     logger.info("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
@@ -210,14 +210,13 @@ def download_weights_from_hf(
             revision=revision,
             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
         )
-        
+
     return hf_folder
 
 
 def download_safetensors_index_file_from_hf(
     model_name_or_path: str,
     cache_dir: Optional[str],
-    allow_patterns: List[str],
     revision: Optional[str] = None,
 ) -> None:
     """Download hf safetensors index file from Hugging Face Hub.
@@ -243,35 +242,40 @@ def download_safetensors_index_file_from_hf(
         # If file not found on remote or locally, we should not fail since
         # only some models will have _SAFETENSORS_INDEX_FILE_NAME.
         except huggingface_hub.utils.EntryNotFoundError:
-            logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in remote.")
+            logger.info("No %s found in remote.", _SAFETENSORS_INDEX_FILE_NAME)
         except huggingface_hub.utils.LocalEntryNotFoundError:
-            logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in local cache.")
+            logger.info("No %s found in local cache.",
+                        _SAFETENSORS_INDEX_FILE_NAME)
+
 
-# For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)
+# For models like Mistral-7B-v0.3
 # there are both sharded safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.
-# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` to look up 
-# which safetensors files should be used.
-def filter_duplicate_safetensors_files(
-        hf_weights_files: List[str], hf_folder: str) -> List[str]:
-    # model.safetensors.index.json is a mapping from keys in the 
+# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json`
+# to look up which safetensors files should be used.
+def filter_duplicate_safetensors_files(hf_weights_files: List[str],
+                                       hf_folder: str) -> List[str]:
+    # model.safetensors.index.json is a mapping from keys in the
     # torch state_dict to safetensors file holding that weight.
-    index_file = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME)
-    if not os.path.isfile(index_file):
+    index_file_name = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME)
+    if not os.path.isfile(index_file_name):
         return hf_weights_files
-    
+
     # Iterate through the weight_map (weight_name: safetensors files)
     # to identify weights that we should use.
-    weight_map = json.load(open(index_file))["weight_map"]
+    with open(index_file_name) as index_file:
+        weight_map = json.load(index_file)["weight_map"]
     weight_files_in_index = set()
     for weight_name in weight_map:
-        weight_files_in_index.add(os.path.join(hf_folder,
-                                               weight_map[weight_name]))
+        weight_files_in_index.add(
+            os.path.join(hf_folder, weight_map[weight_name]))
     # Filter out any fields that are not found in the index file.
-    hf_weights_files = [f for f in 
-                        hf_weights_files if f in weight_files_in_index]
+    hf_weights_files = [
+        f for f in hf_weights_files if f in weight_files_in_index
+    ]
     return hf_weights_files
 
+
 def filter_files_not_needed_for_inference(
         hf_weights_files: List[str]) -> List[str]:
     """

From f98f7227544f055b95277fe56757f55612d04d30 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:38:31 +0000
Subject: [PATCH 04/17] added test

---
 tests/models/test_mistral.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py
index d0a5bfbfcd922..76b248cf14e98 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -8,6 +8,7 @@
 
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.3",
 ]
 
 

From 56942e6c87093892293f13538c2019622a907c06 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:50:02 +0000
Subject: [PATCH 05/17] updated to use transformers const

---
 vllm/model_executor/model_loader/weight_utils.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 8bdf536305556..2dd9d0d0d036e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,6 +12,7 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
@@ -24,8 +25,6 @@
 
 logger = init_logger(__name__)
 
-_SAFETENSORS_INDEX_FILE_NAME = "model.safetensors.index.json"
-
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
@@ -234,30 +233,30 @@ def download_safetensors_index_file_from_hf(
             # Download the safetensors index file.
             _ = hf_hub_download(
                 repo_id=model_name_or_path,
-                filename=_SAFETENSORS_INDEX_FILE_NAME,
+                filename=SAFE_WEIGHTS_INDEX_NAME,
                 cache_dir=cache_dir,
                 revision=revision,
                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
             )
         # If file not found on remote or locally, we should not fail since
-        # only some models will have _SAFETENSORS_INDEX_FILE_NAME.
+        # only some models will have SAFE_WEIGHTS_INDEX_NAME.
         except huggingface_hub.utils.EntryNotFoundError:
-            logger.info("No %s found in remote.", _SAFETENSORS_INDEX_FILE_NAME)
+            logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME)
         except huggingface_hub.utils.LocalEntryNotFoundError:
             logger.info("No %s found in local cache.",
-                        _SAFETENSORS_INDEX_FILE_NAME)
+                        SAFE_WEIGHTS_INDEX_NAME)
 
 
 # For models like Mistral-7B-v0.3
 # there are both sharded safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.
-# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json`
+# So, we use the SAFE_WEIGHTS_INDEX_NAME
 # to look up which safetensors files should be used.
 def filter_duplicate_safetensors_files(hf_weights_files: List[str],
                                        hf_folder: str) -> List[str]:
     # model.safetensors.index.json is a mapping from keys in the
     # torch state_dict to safetensors file holding that weight.
-    index_file_name = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME)
+    index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME)
     if not os.path.isfile(index_file_name):
         return hf_weights_files
 

From 5f5d057b4c429846f2b89c8a70f829be7a98c661 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:50:43 +0000
Subject: [PATCH 06/17] format

---
 vllm/model_executor/model_loader/weight_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 2dd9d0d0d036e..7838ea4ab5715 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,10 +12,10 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
-from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
 from vllm.config import LoadConfig, ModelConfig
 from vllm.logger import init_logger
@@ -243,8 +243,7 @@ def download_safetensors_index_file_from_hf(
         except huggingface_hub.utils.EntryNotFoundError:
             logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME)
         except huggingface_hub.utils.LocalEntryNotFoundError:
-            logger.info("No %s found in local cache.",
-                        SAFE_WEIGHTS_INDEX_NAME)
+            logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME)
 
 
 # For models like Mistral-7B-v0.3

From 389d025edaab048c60fa993d4f01f742926bb308 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 10:52:58 +0000
Subject: [PATCH 07/17] cleanup stray newline

---
 vllm/model_executor/model_loader/loader.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 18da9e58dcde7..de512d3b60fcf 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -164,7 +164,6 @@ def _prepare_weights(self, model_name_or_path: str,
         elif load_format == LoadFormat.SAFETENSORS:
             use_safetensors = True
             allow_patterns = ["*.safetensors"]
-
         elif load_format == LoadFormat.PT:
             allow_patterns = ["*.pt"]
         elif load_format == LoadFormat.NPCACHE:

From 6c44a4797c41d9bd42d7ffca6d9f8afffb912892 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 23 May 2024 17:19:56 +0200
Subject: [PATCH 08/17] Update vllm/model_executor/model_loader/weight_utils.py

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 vllm/model_executor/model_loader/weight_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 7838ea4ab5715..b0a3f46e70ed7 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -231,7 +231,7 @@ def download_safetensors_index_file_from_hf(
     with get_lock(model_name_or_path, cache_dir):
         try:
             # Download the safetensors index file.
-            _ = hf_hub_download(
+            hf_hub_download(
                 repo_id=model_name_or_path,
                 filename=SAFE_WEIGHTS_INDEX_NAME,
                 cache_dir=cache_dir,

From 23044da8b11f4cced1ed3ca792e0dcc94a82c35b Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 23 May 2024 17:21:18 +0200
Subject: [PATCH 09/17] Update weight_utils.py

---
 vllm/model_executor/model_loader/weight_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index b0a3f46e70ed7..16f587be4972f 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -209,7 +209,6 @@ def download_weights_from_hf(
             revision=revision,
             local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
         )
-
     return hf_folder
 
 

From a61eee275f8ffcd35f86dae61ba0c89bb69fa6ba Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 23 May 2024 19:35:55 +0200
Subject: [PATCH 10/17] Update weight_utils.py

---
 vllm/model_executor/model_loader/weight_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 16f587be4972f..f6087be67f676 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -245,8 +245,8 @@ def download_safetensors_index_file_from_hf(
             logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME)
 
 
-# For models like Mistral-7B-v0.3
-# there are both sharded safetensors files and a consolidated safetensors file.
+# For models like Mistral-7B-v0.3, there are both sharded
+# safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.
 # So, we use the SAFE_WEIGHTS_INDEX_NAME
 # to look up which safetensors files should be used.

From 97c0b2ff206fa2aa0204460ebc5f7ad68fa70f48 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 20:20:37 +0000
Subject: [PATCH 11/17] Skip safetensors index download for local models

---
 vllm/model_executor/model_loader/loader.py       | 6 ++++--
 vllm/model_executor/model_loader/weight_utils.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index de512d3b60fcf..36ffeccd6da25 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -195,8 +195,10 @@ def _prepare_weights(self, model_name_or_path: str,
             # safetensors file. Using both in the weight_loader breaks.
             # Here, we download the `model.safetensors.index.json` and filter
             # any safetensors files not found.
-            download_safetensors_index_file_from_hf(
-                model_name_or_path, self.load_config.download_dir, revision)
+            if not is_local:
+                download_safetensors_index_file_from_hf(
+                    model_name_or_path, self.load_config.download_dir,
+                    revision)
             hf_weights_files = filter_duplicate_safetensors_files(
                 hf_weights_files, hf_folder)
         else:
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f6087be67f676..b295242f29837 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,7 +12,8 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
-from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
+from huggingface_hub import (HfFileSystem, hf_hub_download, snapshot_download,
+                             repo_exists)
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

From 8f9c79291287b896565e84e3159ffaafb59ec138 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Thu, 23 May 2024 20:22:24 +0000
Subject: [PATCH 12/17] Remove unused repo_exists import

---
 vllm/model_executor/model_loader/weight_utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index b295242f29837..f6087be67f676 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -12,8 +12,7 @@
 import huggingface_hub.constants
 import numpy as np
 import torch
-from huggingface_hub import (HfFileSystem, hf_hub_download, snapshot_download,
-                             repo_exists)
+from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from safetensors.torch import load_file, safe_open, save_file
 from tqdm.auto import tqdm
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

From cf5bc213eb01743339448156b447677350e5c4e0 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Thu, 23 May 2024 23:05:19 +0200
Subject: [PATCH 13/17] Update weight_utils.py

update to re-run CI
---
 vllm/model_executor/model_loader/weight_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index f6087be67f676..b271be6f48dcb 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -248,8 +248,8 @@ def download_safetensors_index_file_from_hf(
 # For models like Mistral-7B-v0.3, there are both sharded
 # safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.
-# So, we use the SAFE_WEIGHTS_INDEX_NAME
-# to look up which safetensors files should be used.
+# So, we use the SAFE_WEIGHTS_INDEX_NAME to 
+# look up which safetensors files should be used.
 def filter_duplicate_safetensors_files(hf_weights_files: List[str],
                                        hf_folder: str) -> List[str]:
     # model.safetensors.index.json is a mapping from keys in the

From 88896543c3825f3c8ccbb0362561e9b8d0e7941f Mon Sep 17 00:00:00 2001
From: mgoin <michael@neuralmagic.com>
Date: Thu, 23 May 2024 21:13:38 +0000
Subject: [PATCH 14/17] Fix yapf

---
 vllm/model_executor/model_loader/weight_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index b271be6f48dcb..d5bf28b5336cd 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -248,7 +248,7 @@ def download_safetensors_index_file_from_hf(
 # For models like Mistral-7B-v0.3, there are both sharded
 # safetensors files and a consolidated safetensors file.
 # Passing both of these to the weight loader functionality breaks.
-# So, we use the SAFE_WEIGHTS_INDEX_NAME to 
+# So, we use the SAFE_WEIGHTS_INDEX_NAME to
 # look up which safetensors files should be used.
 def filter_duplicate_safetensors_files(hf_weights_files: List[str],
                                        hf_folder: str) -> List[str]:

From c76ae569edc2226700dbfab01c53a89741250ce1 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 24 May 2024 00:26:58 +0200
Subject: [PATCH 15/17] Update loader.py

---
 vllm/model_executor/model_loader/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 36ffeccd6da25..891028276238f 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -194,7 +194,7 @@ def _prepare_weights(self, model_name_or_path: str,
             # there are both sharded safetensors files and a consolidated
             # safetensors file. Using both in the weight_loader breaks.
             # Here, we download the `model.safetensors.index.json` and filter
-            # any safetensors files not found.
+            # any safetensors files not found in the index.
             if not is_local:
                 download_safetensors_index_file_from_hf(
                     model_name_or_path, self.load_config.download_dir,

From 05434723d8a3e1a7869b1660b0c4f4f2816f0f0f Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 24 May 2024 12:30:47 +0200
Subject: [PATCH 16/17] Update loader.py

---
 vllm/model_executor/model_loader/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 891028276238f..151d1f18bb714 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -192,7 +192,7 @@ def _prepare_weights(self, model_name_or_path: str,
         if use_safetensors:
             # For models like Mistral-7B-Instruct-v0.3
             # there are both sharded safetensors files and a consolidated
-            # safetensors file. Using both in the weight_loader breaks.
+            # safetensors file. Using both breaks.
             # Here, we download the `model.safetensors.index.json` and filter
             # any safetensors files not found in the index.
             if not is_local:

From ec79736619eb7ee0c145b92290374e5f9d40a7ba Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Fri, 24 May 2024 13:24:32 +0200
Subject: [PATCH 17/17] Update loader.py

---
 vllm/model_executor/model_loader/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 151d1f18bb714..b7b5b5e7695f4 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -194,7 +194,7 @@ def _prepare_weights(self, model_name_or_path: str,
             # there are both sharded safetensors files and a consolidated
             # safetensors file. Using both breaks.
             # Here, we download the `model.safetensors.index.json` and filter
-            # any safetensors files not found in the index.
+            # out any files not listed in the index.
             if not is_local:
                 download_safetensors_index_file_from_hf(
                     model_name_or_path, self.load_config.download_dir,