From 98943cf698d00fa865e0f9e9b66d0d4d2a9074cd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 09:35:01 +0000 Subject: [PATCH 01/17] mistral patch --- vllm/model_executor/model_loader/loader.py | 13 ++++++--- .../model_loader/weight_utils.py | 28 +++++++++++++++++++ 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 45ea8160a801b..4036d37822f71 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1,6 +1,7 @@ # ruff: noqa: SIM117 import collections import copy +import json import glob import os from abc import ABC, abstractmethod @@ -24,13 +25,13 @@ set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, filter_files_not_needed_for_inference, - get_quant_config, initialize_dummy_weights, np_cache_weights_iterator, + filter_duplicate_safetensors_files, get_quant_config, + initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models.vlm_base import VisionLanguageModelBase logger = init_logger(__name__) - def _get_quantization_config( model_config: ModelConfig, load_config: LoadConfig) -> Optional[QuantizationConfig]: @@ -186,9 +187,13 @@ def _prepare_weights(self, model_name_or_path: str, if len(hf_weights_files) > 0: if pattern == "*.safetensors": use_safetensors = True + break - - if not use_safetensors: + + if use_safetensors: + hf_weights_files = filter_duplicate_safetensors_files( + hf_weights_files, hf_folder) + else: hf_weights_files = filter_files_not_needed_for_inference( hf_weights_files) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index a1642baa2c90c..c1a0d7a56ab6f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -24,6 +24,8 @@ logger = init_logger(__name__) +_SAFETENSORS_INDEX_FILE_NAME = "model.safetensors.index.json" + # use system-level temp directory for file locks, so that multiple users # can share the same lock without error. # lock files in the temp directory will be automatically deleted when the @@ -195,6 +197,8 @@ def download_weights_from_hf( if len(matching) > 0: allow_patterns = [pattern] break + + allow_patterns.append("*model.safetensors.index.json") logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from @@ -210,6 +214,30 @@ def download_weights_from_hf( ) return hf_folder +# For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) +# there are both sharded safetensors files and a consolidated safetensors file. +# Passing both of these to the weight loader functionality breaks. +# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` to look up +# which safetensors files should be used. +def filter_duplicate_safetensors_files( + hf_weights_files: List[str], hf_folder: str) -> List[str]: + # model.safetensors.index.json is a mapping from keys in the + # torch state_dict to safetensors file holding that weight. + index_file = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME) + if not os.path.isfile(index_file): + return hf_weights_files + + # Iterate through the weight_map (weight_name: safetensors files) + # to identify weights that we should use. + weight_map = json.load(open(index_file))["weight_map"] + weight_files_in_index = set() + for weight_name in weight_map: + weight_files_in_index.add(os.path.join(hf_folder, + weight_map[weight_name])) + # Filter out any fields that are not found in the index file. + hf_weights_files = [f for f in + hf_weights_files if f in weight_files_in_index] + return hf_weights_files def filter_files_not_needed_for_inference( hf_weights_files: List[str]) -> List[str]: From bc7e431787501b73f6230239dcc5c045d95be2fd Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:25:34 +0000 Subject: [PATCH 02/17] cleaned up for upstream --- vllm/model_executor/model_loader/loader.py | 16 ++++++-- .../model_loader/weight_utils.py | 39 +++++++++++++++++-- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 4036d37822f71..11a08d060e4f0 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -24,8 +24,9 @@ from vllm.model_executor.model_loader.utils import (get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( - download_weights_from_hf, filter_files_not_needed_for_inference, - filter_duplicate_safetensors_files, get_quant_config, + download_weights_from_hf, download_safetensors_index_file_from_hf, + filter_files_not_needed_for_inference, + filter_duplicate_safetensors_files, get_quant_config, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models.vlm_base import VisionLanguageModelBase @@ -164,6 +165,7 @@ def _prepare_weights(self, model_name_or_path: str, elif load_format == LoadFormat.SAFETENSORS: use_safetensors = True allow_patterns = ["*.safetensors"] + elif load_format == LoadFormat.PT: allow_patterns = ["*.pt"] elif load_format == LoadFormat.NPCACHE: @@ -187,12 +189,18 @@ def _prepare_weights(self, model_name_or_path: str, if len(hf_weights_files) > 0: if pattern == "*.safetensors": use_safetensors = True - break if use_safetensors: + # For models like Mistral-7B-Instruct-v0.3 + # there are both sharded safetensors files and a consolidated safetensors file. + # Passing both of these to the weight loader functionality breaks. + # Here, we download the `model.safetensors.index.json` and filter out + # any safetensors files not found. + download_safetensors_index_file_from_hf( + model_name_or_path, self.load_config.download_dir, revision) hf_weights_files = filter_duplicate_safetensors_files( - hf_weights_files, hf_folder) + hf_weights_files, hf_folder) else: hf_weights_files = filter_files_not_needed_for_inference( hf_weights_files) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index c1a0d7a56ab6f..74782fa7bc4a0 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,7 +12,7 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import HfFileSystem, snapshot_download +from huggingface_hub import HfFileSystem, snapshot_download, hf_hub_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm @@ -198,8 +198,6 @@ def download_weights_from_hf( allow_patterns = [pattern] break - allow_patterns.append("*model.safetensors.index.json") - logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. @@ -212,8 +210,43 @@ def download_weights_from_hf( revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) + return hf_folder + +def download_safetensors_index_file_from_hf( + model_name_or_path: str, + cache_dir: Optional[str], + allow_patterns: List[str], + revision: Optional[str] = None, +) -> None: + """Download hf safetensors index file from Hugging Face Hub. + + Args: + model_name_or_path (str): The model name or path. + cache_dir (Optional[str]): The cache directory to store the model + weights. If None, will use HF defaults. + revision (Optional[str]): The revision of the model. + """ + # Use file lock to prevent multiple processes from + # downloading the same model weights at the same time. + with get_lock(model_name_or_path, cache_dir): + try: + # Download the safetensors index file. + _ = hf_hub_download( + repo_id=model_name_or_path, + filename=_SAFETENSORS_INDEX_FILE_NAME, + cache_dir=cache_dir, + revision=revision, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ) + # If file not found on remote or locally, we should not fail since + # only some models will have _SAFETENSORS_INDEX_FILE_NAME. + except huggingface_hub.utils.EntryNotFoundError: + logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in remote.") + except huggingface_hub.utils.LocalEntryNotFoundError: + logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in local cache.") + # For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) # there are both sharded safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. From 9ebd6d469f1823f09a59ebb2cb7dcb281c2a9882 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:34:33 +0000 Subject: [PATCH 03/17] format --- vllm/model_executor/model_loader/loader.py | 17 ++++--- .../model_loader/weight_utils.py | 44 ++++++++++--------- 2 files changed, 32 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 11a08d060e4f0..18da9e58dcde7 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1,7 +1,6 @@ # ruff: noqa: SIM117 import collections import copy -import json import glob import os from abc import ABC, abstractmethod @@ -24,15 +23,15 @@ from vllm.model_executor.model_loader.utils import (get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( - download_weights_from_hf, download_safetensors_index_file_from_hf, - filter_files_not_needed_for_inference, - filter_duplicate_safetensors_files, get_quant_config, - initialize_dummy_weights, np_cache_weights_iterator, + download_safetensors_index_file_from_hf, download_weights_from_hf, + filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, + get_quant_config, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models.vlm_base import VisionLanguageModelBase logger = init_logger(__name__) + def _get_quantization_config( model_config: ModelConfig, load_config: LoadConfig) -> Optional[QuantizationConfig]: @@ -190,12 +189,12 @@ def _prepare_weights(self, model_name_or_path: str, if pattern == "*.safetensors": use_safetensors = True break - + if use_safetensors: # For models like Mistral-7B-Instruct-v0.3 - # there are both sharded safetensors files and a consolidated safetensors file. - # Passing both of these to the weight loader functionality breaks. - # Here, we download the `model.safetensors.index.json` and filter out + # there are both sharded safetensors files and a consolidated + # safetensors file. Using both in the weight_loader breaks. + # Here, we download the `model.safetensors.index.json` and filter # any safetensors files not found. download_safetensors_index_file_from_hf( model_name_or_path, self.load_config.download_dir, revision) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 74782fa7bc4a0..8bdf536305556 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,7 +12,7 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import HfFileSystem, snapshot_download, hf_hub_download +from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm @@ -197,7 +197,7 @@ def download_weights_from_hf( if len(matching) > 0: allow_patterns = [pattern] break - + logger.info("Using model weights format %s", allow_patterns) # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. @@ -210,14 +210,13 @@ def download_weights_from_hf( revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) - + return hf_folder def download_safetensors_index_file_from_hf( model_name_or_path: str, cache_dir: Optional[str], - allow_patterns: List[str], revision: Optional[str] = None, ) -> None: """Download hf safetensors index file from Hugging Face Hub. @@ -243,35 +242,40 @@ def download_safetensors_index_file_from_hf( # If file not found on remote or locally, we should not fail since # only some models will have _SAFETENSORS_INDEX_FILE_NAME. except huggingface_hub.utils.EntryNotFoundError: - logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in remote.") + logger.info("No %s found in remote.", _SAFETENSORS_INDEX_FILE_NAME) except huggingface_hub.utils.LocalEntryNotFoundError: - logger.info(f"No {_SAFETENSORS_INDEX_FILE_NAME} found in local cache.") + logger.info("No %s found in local cache.", + _SAFETENSORS_INDEX_FILE_NAME) + -# For models like Mistral-v0.3 (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) +# For models like Mistral-7B-v0.3 # there are both sharded safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. -# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` to look up -# which safetensors files should be used. -def filter_duplicate_safetensors_files( - hf_weights_files: List[str], hf_folder: str) -> List[str]: - # model.safetensors.index.json is a mapping from keys in the +# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` +# to look up which safetensors files should be used. +def filter_duplicate_safetensors_files(hf_weights_files: List[str], + hf_folder: str) -> List[str]: + # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. - index_file = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME) - if not os.path.isfile(index_file): + index_file_name = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME) + if not os.path.isfile(index_file_name): return hf_weights_files - + # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. - weight_map = json.load(open(index_file))["weight_map"] + with open(index_file_name) as index_file: + weight_map = json.load(index_file)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: - weight_files_in_index.add(os.path.join(hf_folder, - weight_map[weight_name])) + weight_files_in_index.add( + os.path.join(hf_folder, weight_map[weight_name])) # Filter out any fields that are not found in the index file. - hf_weights_files = [f for f in - hf_weights_files if f in weight_files_in_index] + hf_weights_files = [ + f for f in hf_weights_files if f in weight_files_in_index + ] return hf_weights_files + def filter_files_not_needed_for_inference( hf_weights_files: List[str]) -> List[str]: """ From f98f7227544f055b95277fe56757f55612d04d30 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:38:31 +0000 Subject: [PATCH 04/17] added test --- tests/models/test_mistral.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index d0a5bfbfcd922..76b248cf14e98 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -8,6 +8,7 @@ MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", + "mistralai/Mistral-7B-Instruct-v0.3", ] From 56942e6c87093892293f13538c2019622a907c06 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:50:02 +0000 Subject: [PATCH 05/17] updated to use transformers const --- vllm/model_executor/model_loader/weight_utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8bdf536305556..2dd9d0d0d036e 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,6 +12,7 @@ import huggingface_hub.constants import numpy as np import torch +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm @@ -24,8 +25,6 @@ logger = init_logger(__name__) -_SAFETENSORS_INDEX_FILE_NAME = "model.safetensors.index.json" - # use system-level temp directory for file locks, so that multiple users # can share the same lock without error. # lock files in the temp directory will be automatically deleted when the @@ -234,30 +233,30 @@ def download_safetensors_index_file_from_hf( # Download the safetensors index file. _ = hf_hub_download( repo_id=model_name_or_path, - filename=_SAFETENSORS_INDEX_FILE_NAME, + filename=SAFE_WEIGHTS_INDEX_NAME, cache_dir=cache_dir, revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) # If file not found on remote or locally, we should not fail since - # only some models will have _SAFETENSORS_INDEX_FILE_NAME. + # only some models will have SAFE_WEIGHTS_INDEX_NAME. except huggingface_hub.utils.EntryNotFoundError: - logger.info("No %s found in remote.", _SAFETENSORS_INDEX_FILE_NAME) + logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME) except huggingface_hub.utils.LocalEntryNotFoundError: logger.info("No %s found in local cache.", - _SAFETENSORS_INDEX_FILE_NAME) + SAFE_WEIGHTS_INDEX_NAME) # For models like Mistral-7B-v0.3 # there are both sharded safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. -# So, we use the _SAFETENSORS_INDEX_FILE `model.safetensors.index.json` +# So, we use the SAFE_WEIGHTS_INDEX_NAME # to look up which safetensors files should be used. def filter_duplicate_safetensors_files(hf_weights_files: List[str], hf_folder: str) -> List[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. - index_file_name = os.path.join(hf_folder, _SAFETENSORS_INDEX_FILE_NAME) + index_file_name = os.path.join(hf_folder, SAFE_WEIGHTS_INDEX_NAME) if not os.path.isfile(index_file_name): return hf_weights_files From 5f5d057b4c429846f2b89c8a70f829be7a98c661 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:50:43 +0000 Subject: [PATCH 06/17] format --- vllm/model_executor/model_loader/weight_utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 2dd9d0d0d036e..7838ea4ab5715 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,10 +12,10 @@ import huggingface_hub.constants import numpy as np import torch -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import LoadConfig, ModelConfig from vllm.logger import init_logger @@ -243,8 +243,7 @@ def download_safetensors_index_file_from_hf( except huggingface_hub.utils.EntryNotFoundError: logger.info("No %s found in remote.", SAFE_WEIGHTS_INDEX_NAME) except huggingface_hub.utils.LocalEntryNotFoundError: - logger.info("No %s found in local cache.", - SAFE_WEIGHTS_INDEX_NAME) + logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME) # For models like Mistral-7B-v0.3 From 389d025edaab048c60fa993d4f01f742926bb308 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 10:52:58 +0000 Subject: [PATCH 07/17] cleanup stray newline --- vllm/model_executor/model_loader/loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 18da9e58dcde7..de512d3b60fcf 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -164,7 +164,6 @@ def _prepare_weights(self, model_name_or_path: str, elif load_format == LoadFormat.SAFETENSORS: use_safetensors = True allow_patterns = ["*.safetensors"] - elif load_format == LoadFormat.PT: allow_patterns = ["*.pt"] elif load_format == LoadFormat.NPCACHE: From 6c44a4797c41d9bd42d7ffca6d9f8afffb912892 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 23 May 2024 17:19:56 +0200 Subject: [PATCH 08/17] Update vllm/model_executor/model_loader/weight_utils.py Co-authored-by: Cody Yu --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 7838ea4ab5715..b0a3f46e70ed7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -231,7 +231,7 @@ def download_safetensors_index_file_from_hf( with get_lock(model_name_or_path, cache_dir): try: # Download the safetensors index file. - _ = hf_hub_download( + hf_hub_download( repo_id=model_name_or_path, filename=SAFE_WEIGHTS_INDEX_NAME, cache_dir=cache_dir, From 23044da8b11f4cced1ed3ca792e0dcc94a82c35b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 23 May 2024 17:21:18 +0200 Subject: [PATCH 09/17] Update weight_utils.py --- vllm/model_executor/model_loader/weight_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b0a3f46e70ed7..16f587be4972f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -209,7 +209,6 @@ def download_weights_from_hf( revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) - return hf_folder From a61eee275f8ffcd35f86dae61ba0c89bb69fa6ba Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 23 May 2024 19:35:55 +0200 Subject: [PATCH 10/17] Update weight_utils.py --- vllm/model_executor/model_loader/weight_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 16f587be4972f..f6087be67f676 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -245,8 +245,8 @@ def download_safetensors_index_file_from_hf( logger.info("No %s found in local cache.", SAFE_WEIGHTS_INDEX_NAME) -# For models like Mistral-7B-v0.3 -# there are both sharded safetensors files and a consolidated safetensors file. +# For models like Mistral-7B-v0.3, there are both sharded +# safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. # So, we use the SAFE_WEIGHTS_INDEX_NAME # to look up which safetensors files should be used. From 97c0b2ff206fa2aa0204460ebc5f7ad68fa70f48 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 20:20:37 +0000 Subject: [PATCH 11/17] updated --- vllm/model_executor/model_loader/loader.py | 6 ++++-- vllm/model_executor/model_loader/weight_utils.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index de512d3b60fcf..36ffeccd6da25 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -195,8 +195,10 @@ def _prepare_weights(self, model_name_or_path: str, # safetensors file. Using both in the weight_loader breaks. # Here, we download the `model.safetensors.index.json` and filter # any safetensors files not found. - download_safetensors_index_file_from_hf( - model_name_or_path, self.load_config.download_dir, revision) + if not is_local: + download_safetensors_index_file_from_hf( + model_name_or_path, self.load_config.download_dir, + revision) hf_weights_files = filter_duplicate_safetensors_files( hf_weights_files, hf_folder) else: diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f6087be67f676..b295242f29837 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,7 +12,8 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download +from huggingface_hub import (HfFileSystem, hf_hub_download, snapshot_download, + repo_exists) from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm from transformers.utils import SAFE_WEIGHTS_INDEX_NAME From 8f9c79291287b896565e84e3159ffaafb59ec138 Mon Sep 17 00:00:00 2001 From: Robert Shaw Date: Thu, 23 May 2024 20:22:24 +0000 Subject: [PATCH 12/17] updated --- vllm/model_executor/model_loader/weight_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b295242f29837..f6087be67f676 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -12,8 +12,7 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import (HfFileSystem, hf_hub_download, snapshot_download, - repo_exists) +from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm from transformers.utils import SAFE_WEIGHTS_INDEX_NAME From cf5bc213eb01743339448156b447677350e5c4e0 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 23 May 2024 23:05:19 +0200 Subject: [PATCH 13/17] Update weight_utils.py update to re-run CI --- vllm/model_executor/model_loader/weight_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f6087be67f676..b271be6f48dcb 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -248,8 +248,8 @@ def download_safetensors_index_file_from_hf( # For models like Mistral-7B-v0.3, there are both sharded # safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. -# So, we use the SAFE_WEIGHTS_INDEX_NAME -# to look up which safetensors files should be used. +# So, we use the SAFE_WEIGHTS_INDEX_NAME to +# look up which safetensors files should be used. def filter_duplicate_safetensors_files(hf_weights_files: List[str], hf_folder: str) -> List[str]: # model.safetensors.index.json is a mapping from keys in the From 88896543c3825f3c8ccbb0362561e9b8d0e7941f Mon Sep 17 00:00:00 2001 From: mgoin Date: Thu, 23 May 2024 21:13:38 +0000 Subject: [PATCH 14/17] Fix yapf --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index b271be6f48dcb..d5bf28b5336cd 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -248,7 +248,7 @@ def download_safetensors_index_file_from_hf( # For models like Mistral-7B-v0.3, there are both sharded # safetensors files and a consolidated safetensors file. # Passing both of these to the weight loader functionality breaks. -# So, we use the SAFE_WEIGHTS_INDEX_NAME to +# So, we use the SAFE_WEIGHTS_INDEX_NAME to # look up which safetensors files should be used. def filter_duplicate_safetensors_files(hf_weights_files: List[str], hf_folder: str) -> List[str]: From c76ae569edc2226700dbfab01c53a89741250ce1 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 24 May 2024 00:26:58 +0200 Subject: [PATCH 15/17] Update loader.py --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 36ffeccd6da25..891028276238f 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -194,7 +194,7 @@ def _prepare_weights(self, model_name_or_path: str, # there are both sharded safetensors files and a consolidated # safetensors file. Using both in the weight_loader breaks. # Here, we download the `model.safetensors.index.json` and filter - # any safetensors files not found. + # any safetensors files not found in the index. if not is_local: download_safetensors_index_file_from_hf( model_name_or_path, self.load_config.download_dir, From 05434723d8a3e1a7869b1660b0c4f4f2816f0f0f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 24 May 2024 12:30:47 +0200 Subject: [PATCH 16/17] Update loader.py --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 891028276238f..151d1f18bb714 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -192,7 +192,7 @@ def _prepare_weights(self, model_name_or_path: str, if use_safetensors: # For models like Mistral-7B-Instruct-v0.3 # there are both sharded safetensors files and a consolidated - # safetensors file. Using both in the weight_loader breaks. + # safetensors file. Using both breaks. # Here, we download the `model.safetensors.index.json` and filter # any safetensors files not found in the index. if not is_local: From ec79736619eb7ee0c145b92290374e5f9d40a7ba Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 24 May 2024 13:24:32 +0200 Subject: [PATCH 17/17] Update loader.py --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 151d1f18bb714..b7b5b5e7695f4 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -194,7 +194,7 @@ def _prepare_weights(self, model_name_or_path: str, # there are both sharded safetensors files and a consolidated # safetensors file. Using both breaks. # Here, we download the `model.safetensors.index.json` and filter - # any safetensors files not found in the index. + # any files not found in the index. if not is_local: download_safetensors_index_file_from_hf( model_name_or_path, self.load_config.download_dir,