diff --git a/vllm/utils.py b/vllm/utils.py index 729a4332af967..d4a8c962c3bfc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -4,7 +4,6 @@ import subprocess import uuid import gc -from functools import cache from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -121,7 +120,6 @@ def is_hip() -> bool: return torch.version.hip is not None -@cache def is_neuron() -> bool: try: import transformers_neuronx @@ -130,7 +128,6 @@ def is_neuron() -> bool: return transformers_neuronx is not None -@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" # NOTE: This import statement should be executed lazily since @@ -154,7 +151,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -@cache def in_wsl() -> bool: # Reference: https://github.com/microsoft/WSL/issues/4071 return "microsoft" in " ".join(uname()).lower() @@ -229,7 +225,6 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids)) -@cache def get_nvcc_cuda_version() -> Optional[Version]: cuda_home = os.environ.get('CUDA_HOME') if not cuda_home: