From e3a60a43045c3a232a0d0d40f4db13b8f7f46e85 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 10 Feb 2024 02:59:13 +0000 Subject: [PATCH 01/76] cmake based build system --- pyproject.toml | 1 + requirements-build.txt | 4 +- requirements-rocm.txt | 1 + requirements.txt | 1 + setup.py | 468 ++++++++++++++--------------------------- 5 files changed, 165 insertions(+), 310 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e0a01215ef997..b6d7649477dcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ + "cmake>=3.21", "ninja", "packaging", "setuptools >= 49.4.0", diff --git a/requirements-build.txt b/requirements-build.txt index 7e7e48a1313e5..8975f477fe96c 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,6 +1,6 @@ -# Should be mirrored in pyproject.toml +cmake>=3.21 ninja packaging setuptools>=49.4.0 torch==2.1.2 -wheel \ No newline at end of file +wheel diff --git a/requirements-rocm.txt b/requirements-rocm.txt index d5a3bd423b6b3..c30479e40f521 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1,3 +1,4 @@ +cmake>=3.21 ninja # For faster builds. typing-extensions>=4.8.0 starlette diff --git a/requirements.txt b/requirements.txt index d6c33ad85da58..c9a5bd6619402 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +cmake>=3.21 ninja # For faster builds. psutil ray >= 2.9 diff --git a/setup.py b/setup.py index 6f1f2faf54dbc..3ce85f2f9b35b 100644 --- a/setup.py +++ b/setup.py @@ -1,23 +1,17 @@ -import contextlib import io import os import re import subprocess -import warnings -from pathlib import Path -from typing import List, Set +import sys +from typing import List from packaging.version import parse, Version -import setuptools -import sys +from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext +from shutil import which import torch -import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import ( - BuildExtension, - CUDAExtension, - CUDA_HOME, - ROCM_HOME, -) +# ROCM_HOME needed? +from torch.utils.cpp_extension import CUDA_HOME ROOT_DIR = os.path.dirname(__file__) @@ -32,10 +26,109 @@ MAIN_CUDA_VERSION = "12.1" -# Supported NVIDIA GPU architectures. 
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"} -# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) + +def is_ccache_available() -> bool: + return which("ccacheX") is not None + + +def is_ninja_available() -> bool: + return which("ninja") is not None + + +def remove_prefix(text, prefix): + if text.startswith(prefix): + return text[len(prefix):] + return text + + +class CMakeExtension(Extension): + + def __init__(self, name, cmake_lists_dir='.', **kwa): + Extension.__init__(self, name, sources=[], **kwa) + self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) + + +class cmake_build_ext(build_ext): + + def build_extensions(self): + # Ensure that CMake is present and working + try: + subprocess.check_output(['cmake', '--version']) + except OSError as e: + raise RuntimeError('Cannot find CMake executable') from e + + for ext in self.extensions: + + extdir = os.path.abspath( + os.path.dirname(self.get_ext_fullpath(ext.name))) + + # Note: optimization level + debug info set by the build type + cfg = os.getenv("VLLM_BUILD_TYPE", "RelWithDebInfo") + + cmake_args = [ + '-DCMAKE_BUILD_TYPE=%s' % cfg, + # Ask CMake to place the resulting library in the directory + # containing the extension + '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format( + cfg.upper(), extdir), + # Other intermediate static libraries are placed in a + # temporary build directory instead + '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( + cfg.upper(), self.build_temp), + # Hint CMake to use the same Python executable that + # is launching the build, prevents possible mismatching if + # multiple versions of Python are installed + '-DPYTHON_EXECUTABLE={}'.format(sys.executable), + ] + + # TODO: change default to 0 + verbose = bool(int(os.getenv('VERBOSE', '1'))) + if verbose: + cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] + + if is_ccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', + ] + + # + # Setup parallelism + # + num_jobs = os.cpu_count() + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) + cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] + + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + + ext_target_name = remove_prefix(ext.name, "vllm.") + + if is_ninja_available(): + build_tool = ['-G', 'Ninja'] + cmake_args += [ + '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', + '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), + ] + build_jobs = [] + else: + build_tool = ['-G', 'Unix Makefiles'] + build_jobs = ['-j', str(num_jobs)] + + # Config + subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + + cmake_args, + cwd=self.build_temp) + + # Build + build_args = [ + '--build', '.', '--config', cfg, '--target', ext_target_name + ] + subprocess.check_call(['cmake'] + build_args + build_jobs, + cwd=self.build_temp) def _is_cuda() -> bool: @@ -55,26 +148,36 @@ def _is_neuron() -> bool: return torch_neuronx_installed -# Compiler flags. -CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# TODO(woosuk): Should we use -O3? 
-NVCC_FLAGS = ["-O2", "-std=c++17"] +def _is_cuda() -> bool: + return (torch.version.cuda is not None) and not _is_neuron() + + +def _install_punica() -> bool: + install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + device_count = torch.cuda.device_count() + for i in range(device_count): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + install_punica = False + break + return install_punica -if _is_hip(): - if ROCM_HOME is None: - raise RuntimeError("Cannot find ROCM_HOME. " - "ROCm must be available to build the package.") - NVCC_FLAGS += ["-DUSE_ROCM"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"] -if _is_cuda() and CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) -ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + +def find_version(filepath: str) -> str: + """Extract version information from the given filepath. + + Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py + """ + with open(filepath) as fp: + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") def get_hipcc_rocm_version(): @@ -99,11 +202,6 @@ def get_hipcc_rocm_version(): return None -def glob(pattern: str): - root = Path(__name__).parent - return [str(p) for p in root.glob(pattern)] - - def get_neuronxcc_version(): import sysconfig site_dir = sysconfig.get_paths()["purelib"] @@ -123,12 +221,12 @@ def get_neuronxcc_version(): raise RuntimeError("Could not find HIP version in the output") -def get_nvcc_cuda_version(cuda_dir: str) -> Version: +def get_nvcc_cuda_version() -> Version: """Get the CUDA version from nvcc. Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 @@ -136,267 +234,6 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: return nvcc_cuda_version -def get_pytorch_rocm_arch() -> Set[str]: - """Get the cross section of Pytorch,and vllm supported gfx arches - - ROCM can get the supported gfx architectures in one of two ways - Either through the PYTORCH_ROCM_ARCH env var, or output from - rocm_agent_enumerator. - - In either case we can generate a list of supported arch's and - cross reference with VLLM's own ROCM_SUPPORTED_ARCHs. - """ - env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None) - - # If we don't have PYTORCH_ROCM_ARCH specified pull the list from - # rocm_agent_enumerator - if env_arch_list is None: - command = "rocm_agent_enumerator" - env_arch_list = (subprocess.check_output( - [command]).decode('utf-8').strip().replace("\n", ";")) - arch_source_str = "rocm_agent_enumerator" - else: - arch_source_str = "PYTORCH_ROCM_ARCH env variable" - - # List are separated by ; or space. - pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";")) - - # Filter out the invalid architectures and print a warning. 
- arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS) - - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - f"None of the ROCM architectures in {arch_source_str} " - f"({env_arch_list}) is supported. " - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") - invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS - if invalid_arch_list: - warnings.warn( - f"Unsupported ROCM architectures ({invalid_arch_list}) are " - f"excluded from the {arch_source_str} output " - f"({env_arch_list}). Supported ROCM architectures are: " - f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2) - return arch_list - - -def get_torch_arch_list() -> Set[str]: - # TORCH_CUDA_ARCH_LIST can have one or more architectures, - # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the - # compiler to additionally include PTX code that can be runtime-compiled - # and executed on the 8.6 or newer architectures. While the PTX code will - # not give the best performance on the newer architectures, it provides - # forward compatibility. - env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if env_arch_list is None: - return set() - - # List are separated by ; or space. - torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) - if not torch_arch_list: - return set() - - # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) - arch_list = torch_arch_list.intersection(valid_archs) - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " - f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}.") - invalid_arch_list = torch_arch_list - valid_archs - if invalid_arch_list: - warnings.warn( - f"Unsupported CUDA architectures ({invalid_arch_list}) are " - "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA architectures are: " - f"{valid_archs}.", - stacklevel=2) - return arch_list - - -if _is_hip(): - rocm_arches = get_pytorch_rocm_arch() - NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] -else: - # First, check the TORCH_CUDA_ARCH_LIST environment variable. - compute_capabilities = get_torch_arch_list() - -if _is_cuda() and not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") - -ext_modules = [] - -if _is_cuda(): - nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) - if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - # Validate the NVCC CUDA version. 
- if nvcc_cuda_version < Version("11.0"): - raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") - if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute - # capability 8.9. However, GPUs with compute capability 8.9 can - # also run the code generated by the previous versions of CUDA 11 - # and targeting compute capability 8.0. Therefore, if CUDA 11.8 - # is not available, we target compute capability 8.0 instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") - - NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() - - # Add target compute capabilities to NVCC flags. - for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=sm_{num}" - ] - if capability.endswith("+PTX"): - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - - # Use NVCC threads to parallelize the build. 
- if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] - - if nvcc_cuda_version >= Version("11.8"): - NVCC_FLAGS += ["-DENABLE_FP8_E5M2"] - - # changes for punica kernels - NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS - REMOVE_NVCC_FLAGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', - ] - for flag in REMOVE_NVCC_FLAGS: - with contextlib.suppress(ValueError): - torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag) - - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - install_punica = False - break - if install_punica: - ext_modules.append( - CUDAExtension( - name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + - glob("csrc/punica/bgmv/*.cu"), - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS_PUNICA, - }, - )) -elif _is_neuron(): - neuronxcc_version = get_neuronxcc_version() - -vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/quantization/gptq/q_gemm.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/moe_align_block_size_kernels.cu", - "csrc/pybind.cpp", -] - -if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append( - "csrc/quantization/marlin/marlin_cuda_kernel.cu") - vllm_extension_sources.append("csrc/custom_all_reduce.cu") - - # Add MoE kernels. - ext_modules.append( - CUDAExtension( - name="vllm._moe_C", - sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"), - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - )) - -if not _is_neuron(): - vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - libraries=["cuda"] if _is_cuda() else [], - ) - ext_modules.append(vllm_extension) - - -def get_path(*filepath) -> str: - return os.path.join(ROOT_DIR, *filepath) - - -def find_version(filepath: str) -> str: - """Extract version information from the given filepath. 
- - Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py - """ - with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) @@ -413,10 +250,15 @@ def get_vllm_version() -> str: version += f"+rocm{rocm_version_str}" elif _is_neuron(): # Get the Neuron version - neuron_version = str(neuronxcc_version) + neuron_version = str(get_neuronxcc_version()) if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" + elif _is_cuda(): + cuda_version = str(get_nvcc_cuda_version()) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" else: raise RuntimeError("Unknown runtime environment") @@ -456,14 +298,24 @@ def get_requirements() -> List[str]: return requirements +ext_modules = [] + +if _is_cuda(): + ext_modules.append(CMakeExtension(name="vllm._moe_C")) + + if _install_punica(): + ext_modules.append(CMakeExtension(name="vllm._punica_C")) + +if not _is_neuron(): + ext_modules.append(CMakeExtension(name="vllm._C")) + package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } if os.environ.get("VLLM_USE_PRECOMPILED"): - ext_modules = [] package_data["vllm"].append("*.so") -setuptools.setup( +setup( name="vllm", version=get_vllm_version(), author="vLLM Team", @@ -485,11 +337,11 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", - "examples", "tests")), + packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", + "tests")), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, + cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, package_data=package_data, ) From b500bb44da85f44dad3a215f12c7a00536e64b64 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 10 Feb 2024 03:07:18 +0000 Subject: [PATCH 02/76] comment out newer bits --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ce85f2f9b35b..e955c70d80c19 100644 --- a/setup.py +++ b/setup.py @@ -301,7 +301,7 @@ def get_requirements() -> List[str]: ext_modules = [] if _is_cuda(): - ext_modules.append(CMakeExtension(name="vllm._moe_C")) +# ext_modules.append(CMakeExtension(name="vllm._moe_C")) if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) From cc2407d2e31efed8c21e4315c0472e667edd9831 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 02:58:00 +0000 Subject: [PATCH 03/76] merge --- CMakeLists.txt | 156 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 8 +-- 2 files changed, 157 insertions(+), 7 deletions(-) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000000..9b78770ec7960 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,156 @@ +cmake_minimum_required(VERSION 3.21) + +project(vllm_extensions LANGUAGES CXX) + +# +# Find where user site-packages are installed and add it to cmake's 
search path. +# + +if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import site; print(site.getusersitepackages())" + OUTPUT_VARIABLE SITE_PATH + ERROR_VARIABLE SITE_PATH_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(SITE_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate site-packages path," + " full error message:\n${SITE_PATH_ERR}") +endif() + +list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) + +# +# Find packages needed to compile +# +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Torch 2.1.2 EXACT REQUIRED) +append_torchlib_if_found(torch_python) +find_package(MPI REQUIRED) + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import torch.utils.cpp_extension as torch_cpp_ext; print(' '.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" + OUTPUT_VARIABLE TORCH_NVCC_FLAGS + ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(TORCH_NVCC_FLAGS STREQUAL "") + message(FATAL_ERROR "Unable to determine torch nvcc compiler flags," + " full error message:\n${TORCH_NVCC_FLAGS_ERR}") +endif() + +string(STRIP ${TORCH_NVCC_FLAGS} TORCH_NVCC_FLAGS) +list(APPEND NVCC_FLAGS ${TORCH_NVCC_FLAGS}) + +set(PUNICA_NVCC_FLAGS "${NVCC_FLAGS}") +foreach(OPT + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__" + ) + string(REPLACE ${OPT} "" PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) +endforeach() +string(STRIP ${PUNICA_NVCC_FLAGS} PUNICA_NVCC_FLAGS) + +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") +endif() + +# +# Check for existence of CUDA/HIP language support +# +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html +include(CheckLanguage) +check_language(HIP) +check_language(CUDA) + +if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") + enable_language(HIP) + list(APPEND NVCC_FLAGS "-DUSE_ROCM -U__HIP_NO_HALF_CONVERSIONS__ -U__HIP_NO_HALF_OPERATORS__") + + # TODO: intersect with this list? + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942") + endif() + + foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + list(APPEND NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + endforeach() +elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") + enable_language(CUDA) + set(IS_CUDA true) + + # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? + + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES + # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") + # TODO: PTX stuff + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # This indicates support for both real architectures (i.e, no ptx). 
+ set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + endif() +else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") +endif() + +if(NVCC_THREADS) + list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") +endif() + +# +# Define target source files +# + +set(VLLM_EXT_SRC + "csrc/cache_kernels.cu" + "csrc/attention/attention_kernels.cu" + "csrc/pos_encoding_kernels.cu" + "csrc/activation_kernels.cu" + "csrc/layernorm_kernels.cu" + "csrc/quantization/squeezellm/quant_cuda_kernel.cu" + "csrc/quantization/gptq/q_gemm.cu" + "csrc/cuda_utils_kernels.cu" + "csrc/moe_align_block_size_kernels.cu" + "csrc/pybind.cpp") + +if(IS_CUDA) + list(APPEND VLLM_EXT_SRC + "csrc/quantization/awq/gemm_kernels.cu" + "csrc/custom_all_reduce.cu") +endif() + +File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") +File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") + +# +# Define targets +# +set(CMAKE_CXX_STANDARD 17) + +function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + # Note: optimization level/debug info is set by build type + if (IS_CUDA) + set(CUDA_LANG "CUDA") + else() + set(CUDA_LANG "HIP") + endif() + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_NVCC_FLAGS}>) + target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) +endfunction() + +define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") diff --git a/setup.py b/setup.py index e955c70d80c19..69fcfa047ef97 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,6 @@ from setuptools.command.build_ext import build_ext from shutil import which import torch -# ROCM_HOME needed? from torch.utils.cpp_extension import CUDA_HOME ROOT_DIR = os.path.dirname(__file__) @@ -19,11 +18,6 @@ assert sys.platform.startswith( "linux"), "vLLM only supports Linux platform (including WSL)." -# If you are developing the C++ backend of vLLM, consider building vLLM with -# `python setup.py develop` since it will give you incremental builds. 
-# The downside is that this method is deprecated, see -# https://github.com/pypa/setuptools/issues/917 - MAIN_CUDA_VERSION = "12.1" @@ -301,7 +295,7 @@ def get_requirements() -> List[str]: ext_modules = [] if _is_cuda(): -# ext_modules.append(CMakeExtension(name="vllm._moe_C")) + ext_modules.append(CMakeExtension(name="vllm._moe_C")) if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) From c9ac7ad590b67781e4f50f5fee5158122a72b194 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:06:38 +0000 Subject: [PATCH 04/76] try adding CMakeLists.txt to MANIFEST.in --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 0c897cf147f10..38c9e58b4e73e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include LICENSE include requirements.txt +include CMakeLists.txt recursive-include csrc * From 3123f57b73ee1e21a46a1bf68362c6753b5d96b4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:08:23 +0000 Subject: [PATCH 05/76] try adding it to dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8be03b3567f0e..c2354ca1f470d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py +COPY CMakeLists.txt CMakeLists.txt COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py From a4d99ea8e442e886685da5775e8ec2cd1b0371b8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:12:30 +0000 Subject: [PATCH 06/76] add another path to CMAKE_PREFIX_PATH --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b78770ec7960..2783a5d119501 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,26 @@ endif() list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) +############### + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import torch; print(torch.utils.cmake_prefix_path)" + OUTPUT_VARIABLE TORCH_PATH + ERROR_VARIABLE TORCH_PATH_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(TORCH_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate torch cmake_prefix_path," + " full error message:\n${TORCH_PATH_ERR}") +endif() + +list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) + +############### + + # # Find packages needed to compile # From 41dbdc959ee15b18e47b15a14ddcdd3a78a82b55 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:15:08 +0000 Subject: [PATCH 07/76] try again --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2783a5d119501..b1a8b7a6deacd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -find_package(MPI REQUIRED) +find_package(MPI) # find_package(MPI REQUIRED) execute_process( COMMAND From ac9d94bbdbb1e0ea974c47896d968ff544700cba Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:46:20 +0000 Subject: [PATCH 08/76] hack to test punica build --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b1a8b7a6deacd..02f1c1d8eaa8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,7 +115,9 @@ elseif(NOT 
CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # TODO: PTX stuff if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # This indicates support for both real architectures (i.e, no ptx). - set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + # TODO: punica not supported for less than 8.0 + # set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") From 102675d1f0d41b630d5ad65d0ab0a319b3f902a5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:49:51 +0000 Subject: [PATCH 09/76] try again --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02f1c1d8eaa8c..1f5fb846164e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,10 +115,10 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # TODO: PTX stuff if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # This indicates support for both real architectures (i.e, no ptx). - # TODO: punica not supported for less than 8.0 - # set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") - set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") endif() + # TODO: punica not supported for less than 8.0 + set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() From dfbafe31beefdfdf1ce26e58c72906131486962a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:53:23 +0000 Subject: [PATCH 10/76] try again --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f5fb846164e4..ea70486e6a672 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,8 +117,6 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # This indicates support for both real architectures (i.e, no ptx). set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") endif() - # TODO: punica not supported for less than 8.0 - set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() @@ -173,6 +171,10 @@ function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() +# TODO: hacks punica not supported for less than 8.0 +set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") +set(CUDA_ARCHITECTURES "80;86;89;90") + define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") From 39a8589b11df0c3af2ffa5af8731a2dad9bc2043 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:05:23 +0000 Subject: [PATCH 11/76] cleanup path stuff. try hacking arches again --- CMakeLists.txt | 57 +++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea70486e6a672..d5d47167654f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,46 +3,31 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # -# Find where user site-packages are installed and add it to cmake's search path. +# Find where user site-packages and torch are installed and add it to cmake's search path. 
# -if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) -endif() - -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import site; print(site.getusersitepackages())" - OUTPUT_VARIABLE SITE_PATH - ERROR_VARIABLE SITE_PATH_ERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - -if(SITE_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate site-packages path," - " full error message:\n${SITE_PATH_ERR}") -endif() - -list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) - -############### +function (append_cmake_prefix_path PKG EXPR) + if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) + endif() -execute_process( + execute_process( COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import torch; print(torch.utils.cmake_prefix_path)" - OUTPUT_VARIABLE TORCH_PATH - ERROR_VARIABLE TORCH_PATH_ERR + "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" + OUTPUT_VARIABLE PREFIX_PATH + ERROR_VARIABLE PREFIX_PATH_ERR OUTPUT_STRIP_TRAILING_WHITESPACE) -if(TORCH_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate torch cmake_prefix_path," - " full error message:\n${TORCH_PATH_ERR}") -endif() + if(PREFIX_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate ${PKG} path," + " full error message:\n${PREFIX_PATH_ERR}") + endif() -list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) +endfunction() -############### +append_cmake_prefix_path("site" "site.getusersitepackages()") +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # @@ -84,6 +69,10 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() +# hack arches to test +# TODO: need to strip out cuda arches < 8.0 for punica +set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") + # # Check for existence of CUDA/HIP language support # @@ -171,10 +160,6 @@ function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -# TODO: hacks punica not supported for less than 8.0 -set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") -set(CUDA_ARCHITECTURES "80;86;89;90") - define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") From f66b286d419d42f8ea29dd152c00e7ee6cf11681 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:15:40 +0000 Subject: [PATCH 12/76] fix typo --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5d47167654f8..4013e5fb3b3d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ function (append_cmake_prefix_path PKG EXPR) " full error message:\n${PREFIX_PATH_ERR}") endif() - list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endfunction() append_cmake_prefix_path("site" "site.getusersitepackages()") From b2784e0d852f9931edb5b9444ea5601c18669446 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:27:50 +0000 Subject: [PATCH 13/76] change function to macro --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4013e5fb3b3d9..3d69efdbbd63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ project(vllm_extensions LANGUAGES CXX) # Find where user site-packages and torch are installed and add it to cmake's search path. 
# -function (append_cmake_prefix_path PKG EXPR) +macro (append_cmake_prefix_path PKG EXPR) if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() @@ -24,7 +24,7 @@ function (append_cmake_prefix_path PKG EXPR) endif() list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endfunction() +endmacro() append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") From 5464e3072ee22380b4c648cb8ccdfdbdc7205ff2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 22:25:35 +0000 Subject: [PATCH 14/76] flag hacking --- CMakeLists.txt | 106 +++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d69efdbbd63f..4ad10ad1e48b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,15 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -# -# Find where user site-packages and torch are installed and add it to cmake's search path. -# - +# add comment macro (append_cmake_prefix_path PKG EXPR) - if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - execute_process( COMMAND "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" @@ -26,22 +19,32 @@ macro (append_cmake_prefix_path PKG EXPR) list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -append_cmake_prefix_path("site" "site.getusersitepackages()") -append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") - +# add comment why it comes before append_cmake_prefix_path +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(MPI) # +# Find where user site-packages and torch are installed and add it to cmake's search path. # Find packages needed to compile # -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +append_cmake_prefix_path("site" "site.getusersitepackages()") +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + + find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -find_package(MPI) # find_package(MPI REQUIRED) +#set(ENV{TORCH_CUDA_ARCH_LIST} "70;75;80;86;89;90") + +# +# Setup NVCC flags +# + +# add comment execute_process( COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import torch.utils.cpp_extension as torch_cpp_ext; print(' '.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" + "import torch.utils.cpp_extension as torch_cpp_ext; print(';'.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" OUTPUT_VARIABLE TORCH_NVCC_FLAGS ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -51,27 +54,38 @@ if(TORCH_NVCC_FLAGS STREQUAL "") " full error message:\n${TORCH_NVCC_FLAGS_ERR}") endif() -string(STRIP ${TORCH_NVCC_FLAGS} TORCH_NVCC_FLAGS) -list(APPEND NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +set(NVCC_FLAGS ${TORCH_NVCC_FLAGS}) + +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") +endif() + +if(NVCC_THREADS) + list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") +endif() -set(PUNICA_NVCC_FLAGS "${NVCC_FLAGS}") +# +# Copy flags+update for punica +# +set(PUNICA_NVCC_FLAGS ${NVCC_FLAGS}) foreach(OPT "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__" ) - string(REPLACE ${OPT} "" PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) + list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) endforeach() -string(STRIP ${PUNICA_NVCC_FLAGS} PUNICA_NVCC_FLAGS) +#string(REPLACE " " " " PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) + +# remove 
gencode flags added by pytorch +list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") +list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") -endif() -# hack arches to test -# TODO: need to strip out cuda arches < 8.0 for punica -set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") +message("torch nvcc: ${CUDA_NVCC_FLAGS}") +message("nvcc: ${NVCC_FLAGS}") +message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # # Check for existence of CUDA/HIP language support @@ -79,15 +93,15 @@ set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html include(CheckLanguage) check_language(HIP) -check_language(CUDA) +#check_language(CUDA) if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") enable_language(HIP) - list(APPEND NVCC_FLAGS "-DUSE_ROCM -U__HIP_NO_HALF_CONVERSIONS__ -U__HIP_NO_HALF_OPERATORS__") + list(APPEND NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") endif() foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) @@ -98,22 +112,23 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") set(IS_CUDA true) # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? + # cmake env var CUDAARCHS # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") # TODO: PTX stuff - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # This indicates support for both real architectures (i.e, no ptx). - set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") - endif() +# if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) +# # This indicates support for both real architectures (i.e, no ptx). 
+# set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") +# endif() + + set(VLLM_CUDA_ARCHES "70;75;80;86;89;90") + set(VLLM_PUNICA_CUDA_ARCHES "80;86;89;90") # >8.0 of VLLM_CUDA_ARCHITECTURES + else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() -if(NVCC_THREADS) - list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") -endif() - # # Define target source files # @@ -144,22 +159,29 @@ File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") # set(CMAKE_CXX_STANDARD 17) -function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) +#this doesn't seem to work +#set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# add comment +function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + #add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type if (IS_CUDA) set(CUDA_LANG "CUDA") +# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) else() set(CUDA_LANG "HIP") +# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) endif() - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_NVCC_FLAGS}>) + target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") +define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") From 22e3803287fd2afe3f19a795ffdcbad9a89c73e1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:00:57 +0000 Subject: [PATCH 15/76] strip arches out of CMAKE_CUDA_FLAGS --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ad10ad1e48b2..f868de2578521 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,14 +76,17 @@ foreach(OPT ) list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) endforeach() -#string(REPLACE " " " " PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) # remove gencode flags added by pytorch list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") +#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "-gencode") +#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "arch=compute.*") +string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) message("torch nvcc: ${CUDA_NVCC_FLAGS}") +message("torch cuda_flags: ${CMAKE_CUDA_FLAGS}") message("nvcc: ${NVCC_FLAGS}") message("punica nvcc: ${PUNICA_NVCC_FLAGS}") @@ -93,7 +96,7 @@ message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html include(CheckLanguage) check_language(HIP) -#check_language(CUDA) +#check_language(CUDA) # picked up by torch if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") enable_language(HIP) @@ 
-185,3 +188,9 @@ endfunction() define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") + +#get_cmake_property(_variableNames VARIABLES) +#list (SORT _variableNames) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() From e55fc13fcec936bd2299b3fe9310330d9e07f833 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:22:45 +0000 Subject: [PATCH 16/76] more shenanigans --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f868de2578521..2f35895b4432d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,9 @@ macro (append_cmake_prefix_path PKG EXPR) endmacro() # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) # From 6fd7b599c95b1c892b437554854c63fcdd1cb270 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:31:05 +0000 Subject: [PATCH 17/76] try again --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f35895b4432d..3781e3bf7ad9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,10 @@ macro (append_cmake_prefix_path PKG EXPR) endmacro() # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 7c26517a1c283016c682ce214b40d368d8807edb Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 02:58:40 +0000 Subject: [PATCH 18/76] fiddling around --- CMakeLists.txt | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3781e3bf7ad9c..ad6246471a5ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,36 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +# add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) +find_package(MPI) + +if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +# HACKS +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "--version" + OUTPUT_VARIABLE JUNK) +message("junk: ${JUNK}") +execute_process( + COMMAND + "which" "-a" "python3" + OUTPUT_VARIABLE JUNK) +message("junk: ${JUNK}") +# HACKS + +# +# Find where user site-packages and torch are installed and add it to cmake's search path. 
+# Find packages needed to compile +# + # add comment macro (append_cmake_prefix_path PKG EXPR) execute_process( @@ -19,18 +49,6 @@ macro (append_cmake_prefix_path PKG EXPR) list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -# add comment why it comes before append_cmake_prefix_path -set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) -find_package(MPI) - -# -# Find where user site-packages and torch are installed and add it to cmake's search path. -# Find packages needed to compile -# append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") From b0a8531cb8cf19c0a849bc6e23813d60d09563d6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 03:42:56 +0000 Subject: [PATCH 19/76] add some debugging code --- CMakeLists.txt | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6246471a5ee..d7976269d6924 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,19 +14,6 @@ if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() -# HACKS -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "--version" - OUTPUT_VARIABLE JUNK) -message("junk: ${JUNK}") -execute_process( - COMMAND - "which" "-a" "python3" - OUTPUT_VARIABLE JUNK) -message("junk: ${JUNK}") -# HACKS - # # Find where user site-packages and torch are installed and add it to cmake's search path. # Find packages needed to compile @@ -187,8 +174,13 @@ set(CMAKE_CXX_STANDARD 17) #this doesn't seem to work #set(CUDA_PROPAGATE_HOST_FLAGS OFF) +message("pv: ${PYTHON_VERSION_STRING}") + +#set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) + # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") @@ -211,8 +203,8 @@ define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") -#get_cmake_property(_variableNames VARIABLES) -#list (SORT _variableNames) -#foreach (_variableName ${_variableNames}) -# message(STATUS "${_variableName}=${${_variableName}}") -#endforeach() +get_cmake_property(_variableNames VARIABLES) +list (SORT _variableNames) +foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") +endforeach() From d0622c36f1fe07bdfd6444274b554c1d4f0f089c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 03:56:22 +0000 Subject: [PATCH 20/76] try exact python version match for debugging --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7976269d6924..ecd64a6f1c8d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,9 +5,9 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) 
set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) +find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +#set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) +#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) if(NOT DEFINED PYTHON_EXECUTABLE) From 49c0a9c96e67ee4f2061d71bfa61ae8887fc83a2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:13:46 +0000 Subject: [PATCH 21/76] try with more permissive python version(s) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecd64a6f1c8d3..91c13ea5f2ae4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 3cea6a48d79beb68fd26bf1a50885ea33c64ae90 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:22:58 +0000 Subject: [PATCH 22/76] add debugging --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c13ea5f2ae4..0d9a93e4a4451 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_DEBUG_MODE TRUE) set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) From 812fab65da689e8d7da17bf63aa33660f92a37ec Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:26:36 +0000 Subject: [PATCH 23/76] add debugging --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d9a93e4a4451..64d74a9e2e170 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,8 @@ project(vllm_extensions LANGUAGES CXX) set(CMAKE_FIND_DEBUG_MODE TRUE) set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 2654e8452eefd0aa5e24f6124e3d708077fa3163 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:45:09 +0000 Subject: [PATCH 24/76] try using find_package(Python3...) 
--- CMakeLists.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64d74a9e2e170..213553628efbb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,12 +4,21 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_DEBUG_MODE TRUE) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +#set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) +#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) + +find_package(Python3 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) + +#find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +#if (NOT Python_FOUND) +# find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +#endif() + + set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) + find_package(MPI) if(NOT DEFINED PYTHON_EXECUTABLE) @@ -176,14 +185,14 @@ set(CMAKE_CXX_STANDARD 17) #this doesn't seem to work #set(CUDA_PROPAGATE_HOST_FLAGS OFF) -message("pv: ${PYTHON_VERSION_STRING}") +#message("pv: ${PYTHON_VERSION_STRING}") #set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + Python3_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type From d5b6a2dff887c83435e48718412d87db04c354df Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:53:32 +0000 Subject: [PATCH 25/76] try multiple find_package calls for python --- CMakeLists.txt | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 213553628efbb..1031a9cf65358 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,13 +7,10 @@ set(CMAKE_FIND_DEBUG_MODE TRUE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python3 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) - -#find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) -#if (NOT Python_FOUND) -# find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) -#endif() - +find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +if (NOT Python_FOUND) + find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +endif() set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) @@ -192,7 +189,7 @@ set(CMAKE_CXX_STANDARD 17) # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) - Python3_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type From 
e70ebcef509313de32dc3b88e365016837db68a4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:55:21 +0000 Subject: [PATCH 26/76] try multiple find_packages for python --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1031a9cf65358..5ca1502590011 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_FIND_DEBUG_MODE TRUE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) if (NOT Python_FOUND) find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) endif() From 06f9d55fa88b3f337860c977f3f843be7de38514 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:25:55 +0000 Subject: [PATCH 27/76] arch flag parsing in cmake, yay --- CMakeLists.txt | 199 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca1502590011..eec144244145e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,28 +3,21 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path -set(CMAKE_FIND_DEBUG_MODE TRUE) -#set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) - +# add comment why we need to do this multiple times (TODO: maybe use a loop?) find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) if (NOT Python_FOUND) find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) endif() -set(CMAKE_FIND_DEBUG_MODE FALSE) -#set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) - -find_package(MPI) - if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() + +find_package(MPI) + # # Find where user site-packages and torch are installed and add it to cmake's search path. -# Find packages needed to compile # # add comment @@ -51,12 +44,12 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -#set(ENV{TORCH_CUDA_ARCH_LIST} "70;75;80;86;89;90") - # -# Setup NVCC flags +# Setup extra NVCC flags # +# TODO: IS_CUDA only? 
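# (note, not part of the original patch) The execute_process() call below
# queries torch.utils.cpp_extension.COMMON_NVCC_FLAGS from the active torch
# installation so the extensions are compiled with the same nvcc defines that
# torch's own extension builder would add; those flags are CUDA specific,
# hence the "IS_CUDA only?" TODO above.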
+ # add comment execute_process( COMMAND @@ -71,79 +64,147 @@ if(TORCH_NVCC_FLAGS STREQUAL "") " full error message:\n${TORCH_NVCC_FLAGS_ERR}") endif() -set(NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +set(VLLM_NVCC_FLAGS ${TORCH_NVCC_FLAGS}) if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") + list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() if(NVCC_THREADS) - list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") + list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") endif() +set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + # # Copy flags+update for punica # -set(PUNICA_NVCC_FLAGS ${NVCC_FLAGS}) + foreach(OPT "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__" ) - list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) + list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS ${OPT}) endforeach() +# +# deal with arch flags here +# + +# +# CUDA_NVCC_FLAGS holds the complete + canonical flags at this point +# make two versions: regular, punica +# strip out stuff from punica + update versions +# + +message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") +message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # remove gencode flags added by pytorch -list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") -list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "-gencode") -#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "arch=compute.*") +#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") +#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") +#string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) + + +# +# Setup arch flags +# + +string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) +message("arch flags: ${VLLM_CUDA_ARCH_FLAGS}") +# filter ARCH_FLAGS and add them back into CMAKE_CUDA_FLAGS + +#set(VLLM_PUNICA_CUDA_ARCH_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) +#list(FILTER VLLM_PUNICA_CUDA_ARCH_FLAGS EXCLUDE REGEX "compute_[1-7][0-9]") +message("post arch flags: ${VLLM_CUDA_ARCH_FLAGS}") +message("post punica arch flags: ${VLLM_PUNICA_CUDA_ARCH_FLAGS}") + +message("nvcc: ${VLLM_NVCC_FLAGS}") +message("punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + +#list(APPEND VLLM_NVCC_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) +#list(APPEND VLLM_PUNICA_NVCC_FLAGS ${VLLM_PUNICA_CUDA_ARCH_FLAGS}) + +# the painful way: NOTE needs to only happen w/CUDA +set(VLLM_CUDA_ARCHES) +set(VLLM_PUNICA_CUDA_ARCHES) + +macro(string_to_ver VER STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) +endmacro() + +foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) +# message("arch: ${COMPUTE}") + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a*\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) +# message("sm: ${SM}") + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a*\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) +# message("code: ${CODE}") + endif() + + if (COMPUTE AND SM) + list(APPEND VLLM_CUDA_ARCHES ${SM}) + string_to_ver(SM_VER ${SM}) + if (SM_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) + endif() + else() + list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") + string_to_ver(CODE_VER ${CODE}) + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") + endif() + endif() 
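# (illustrative example of the mapping above, not from the patch) a torch flag
# like "-gencode arch=compute_80,code=sm_80" ends up as the entry "80" in
# VLLM_CUDA_ARCHES (and, being >= 8.0, also in VLLM_PUNICA_CUDA_ARCHES), while
# a PTX-only flag such as "-gencode arch=compute_90,code=compute_90" becomes
# "90-virtual"; both forms match what the CUDA_ARCHITECTURES target property
# expects, e.g.:
#   set_target_properties(_C PROPERTIES CUDA_ARCHITECTURES "80;90-virtual")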
+endforeach() + +message("post nvcc: ${VLLM_NVCC_FLAGS}") +message("post punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") +message("post nvcc arch: ${VLLM_CUDA_ARCHES}") +message("post punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") -message("torch nvcc: ${CUDA_NVCC_FLAGS}") -message("torch cuda_flags: ${CMAKE_CUDA_FLAGS}") -message("nvcc: ${NVCC_FLAGS}") -message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # # Check for existence of CUDA/HIP language support # # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html -include(CheckLanguage) -check_language(HIP) +#include(CheckLanguage) +#check_language(HIP) #check_language(CUDA) # picked up by torch -if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") +# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat them. + +if(HIP_FOUND) enable_language(HIP) - list(APPEND NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") + list(APPEND VLLM_NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") endif() foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") endforeach() -elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") +elseif(CUDA_FOUND) enable_language(CUDA) set(IS_CUDA true) - # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? - # cmake env var CUDAARCHS - - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES - # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") - # TODO: PTX stuff -# if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) -# # This indicates support for both real architectures (i.e, no ptx). -# set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") -# endif() - - set(VLLM_CUDA_ARCHES "70;75;80;86;89;90") - set(VLLM_PUNICA_CUDA_ARCHES "80;86;89;90") # >8.0 of VLLM_CUDA_ARCHITECTURES + # TODO: check supported? 
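# (sketch of the check this TODO asks for; the next two commits add essentially
# this inside the gencode loop above, together with an NVIDIA_SUPPORTED_ARCHS
# list they introduce)
#   string_to_ver(SM_VER ${SM})                  # e.g. "75" -> "7.5"
#   if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS)
#     continue()                                 # skip arches vllm does not support
#   endif()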
else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") @@ -171,6 +232,7 @@ if(IS_CUDA) "csrc/custom_all_reduce.cu") endif() +#TODO: list files File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") @@ -179,40 +241,39 @@ File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") # set(CMAKE_CXX_STANDARD 17) -#this doesn't seem to work -#set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -#message("pv: ${PYTHON_VERSION_STRING}") - -#set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - # add comment -function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) - # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - #add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") - # Note: optimization level/debug info is set by build type + if (IS_CUDA) set(CUDA_LANG "CUDA") -# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) else() set(CUDA_LANG "HIP") -# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) endif() - target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) + + # Note: optimization level/debug info is set by build type + # target_compile_options(${MOD_NAME} BEFORE PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) + target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) + +# get_target_property(XXX ${MOD_NAME} COMPILE_OPTIONS) +# message("XXX: ${XXX}") +# get_target_property(XXX ${MOD_NAME} COMPILE_FEATURES) +# message("XXX: ${XXX}") + target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") +define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") -endforeach() +#get_cmake_property(_variableNames VARIABLES) +#list (SORT _variableNames) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() From ef12d5dfe5ea408197f1e962c251299963483e29 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:50:50 +0000 Subject: [PATCH 28/76] filter out unsupported arches --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eec144244145e..edd6324c6dbb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,9 @@ macro(string_to_ver VER STR) string(REGEX REPLACE 
"\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) endmacro() +set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") + foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) if (COMPUTE) @@ -158,14 +161,20 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) endif() if (COMPUTE AND SM) - list(APPEND VLLM_CUDA_ARCHES ${SM}) string_to_ver(SM_VER ${SM}) + if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + continue() + endif() + list(APPEND VLLM_CUDA_ARCHES ${SM}) if (SM_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) endif() else() - list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") string_to_ver(CODE_VER ${CODE}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + continue() + endif() + list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") if (CODE_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") endif() From 4d748676bd69dbcce15cdb9b209f1ae794314270 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:51:11 +0000 Subject: [PATCH 29/76] filter out unsupported arches --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index edd6324c6dbb0..b27ee18d62e88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) if (COMPUTE AND SM) string_to_ver(SM_VER ${SM}) if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + # TODO: issue warning? continue() endif() list(APPEND VLLM_CUDA_ARCHES ${SM}) @@ -172,6 +173,7 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) else() string_to_ver(CODE_VER ${CODE}) if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + # TODO: issue warning? continue() endif() list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") From 9ee738d0b0eeb94b7584f01c1fed57755b1a9cd1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 20:27:11 +0000 Subject: [PATCH 30/76] cleanups + add comments --- CMakeLists.txt | 423 +++++++++++++++++++++++++++++-------------------- 1 file changed, 248 insertions(+), 175 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b27ee18d62e88..18aadcd28d12a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,69 +2,126 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -# add comment why it comes before append_cmake_prefix_path -# add comment why we need to do this multiple times (TODO: maybe use a loop?) -find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) -if (NOT Python_FOUND) - find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) -endif() - -if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) -endif() +# +# Supported python verions. These versions will be searched in order, the +# first match will be selected. +# +set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") +# Supported NVIDIA architectures +set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -find_package(MPI) +# Supported AMD GPU architectures +set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") # -# Find where user site-packages and torch are installed and add it to cmake's search path. +# Loop thru all supported python versions until we find the first match. +# Cmake is unable to pick the lowest supported version when multiple +# versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. 
# +foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) + message(STATUS "Attempting to find python ${VER} package.") + find_package(Python ${VER} EXACT COMPONENTS Interpreter Development.Module) + if (Python_FOUND) + message(STATUS "Found python version ${VER} (${Python_EXECUTABLE}).") + break() + endif() +endforeach() -# add comment -macro (append_cmake_prefix_path PKG EXPR) +if (NOT Python_FOUND) + message(FATAL_ERROR + "No supported version of python found. ('${PYTHON_SUPPORTED_VERSIONS}')") +endif() + +# +# Run EXPR in python. The standard output of python is stored in OUT and has +# trailing whitespace stripped. If an error is encountered when running python, +# a fatal message ERR_MSG is issued. +# +macro (run_python OUT EXPR ERR_MSG) execute_process( COMMAND - "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" - OUTPUT_VARIABLE PREFIX_PATH - ERROR_VARIABLE PREFIX_PATH_ERR + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR OUTPUT_STRIP_TRAILING_WHITESPACE) - if(PREFIX_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate ${PKG} path," - " full error message:\n${PREFIX_PATH_ERR}") + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") endif() +endmacro() + +# +# Try to find MPI package +# +find_package(MPI) +# +# Find where user site-packages and torch are installed and add it to cmake's +# search path. +# + +# Run EXPR in python after importing PKG. Use the result of this to extend +# CMAKE_PREFIX_PATH so we can import the torch cmake configuration. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() +# Add user site-packages and torch path to CMAKE_PREFIX_PATH append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") - +# +# Import torch cmake configuration. +# Torch also imports CUDA/HIP packages with some customizations, so we do not +# need to do this explicitly with check_language/enable_language, etc. +# find_package(Torch 2.1.2 EXACT REQUIRED) + +# For some reason torch does not add libtorch_python.so to the list of torch +# libraries to link. Find it by hand using 'append_torchlib_if_found' from +# torch's cmake setup. append_torchlib_if_found(torch_python) +if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) + message(FATAL_ERROR "Can't find CUDA or HIP installation.") +endif() + +if (NOT HIP_FOUND AND CUDA_FOUND) + set(IS_CUDA true) +endif() + # # Setup extra NVCC flags # +# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat +# detect them explicitly with check_language, etc. +# +if (HIP_FOUND) + list(APPEND VLLM_NVCC_FLAGS + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__") -# TODO: IS_CUDA only? - -# add comment -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import torch.utils.cpp_extension as torch_cpp_ext; print(';'.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" - OUTPUT_VARIABLE TORCH_NVCC_FLAGS - ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR - OUTPUT_STRIP_TRAILING_WHITESPACE) + # TODO: intersect with this list? 
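# (sketch of the intersection this TODO asks for; a later cleanup commit adds
# essentially this)
#   set(VLLM_HIP_ARCHITECTURES)
#   foreach(ARCH ${CMAKE_HIP_ARCHITECTURES})
#     if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS)
#       list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH})
#     endif()
#   endforeach()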
+ if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + endif() -if(TORCH_NVCC_FLAGS STREQUAL "") - message(FATAL_ERROR "Unable to determine torch nvcc compiler flags," - " full error message:\n${TORCH_NVCC_FLAGS_ERR}") + foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + endforeach() endif() -set(VLLM_NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +# TODO: IS_CUDA only? + +# Get common NVCC flags from torch. +run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") @@ -80,149 +137,163 @@ set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) # Copy flags+update for punica # -foreach(OPT - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__" - ) - list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS ${OPT}) -endforeach() +list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") -# -# deal with arch flags here -# +message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") +message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") # -# CUDA_NVCC_FLAGS holds the complete + canonical flags at this point -# make two versions: regular, punica -# strip out stuff from punica + update versions +# Setup/process CUDA arch flags # - -message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") -message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - - -# remove gencode flags added by pytorch -#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") -#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -#string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) - - +# The torch cmake setup detects and hardcodes the detected architecture flags +# in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported +# architectures and the punica target. So we have to extract and remove all +# the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use +# 'target_compiler_options' for adding '-gencode' arguments so we will use the +# target's CUDA_ARCHITECTURES property instead. This requires repackaging +# the architecture flags into a format that cmake expects for +# CUDA_ARCHITECTURES. # -# Setup arch flags +# This is a bit fragile in that it depends on torch using -gencode as opposed +# to one of the other nvcc options to specify architectures. # +# Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override +# detected architectures. +# +message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + +# Extract all '-gencode' flags from CMAKE_CUDA_FLAGS +string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + +# Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying +# them and passing them back in via the CUDA_ARCHITECTURES property. +string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + +# If this error is triggered, it might mean that torch has changed how it sets +# up nvcc architecture code generation flags. +if (NOT VLLM_CUDA_ARCH_FLAGS) + message(FATAL_ERROR + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. 
(${CMAKE_CUDA_FLAGS})") +endif() -string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) -string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) -message("arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -# filter ARCH_FLAGS and add them back into CMAKE_CUDA_FLAGS - -#set(VLLM_PUNICA_CUDA_ARCH_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) -#list(FILTER VLLM_PUNICA_CUDA_ARCH_FLAGS EXCLUDE REGEX "compute_[1-7][0-9]") - -message("post arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -message("post punica arch flags: ${VLLM_PUNICA_CUDA_ARCH_FLAGS}") - -message("nvcc: ${VLLM_NVCC_FLAGS}") -message("punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") +message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -#list(APPEND VLLM_NVCC_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) -#list(APPEND VLLM_PUNICA_NVCC_FLAGS ${VLLM_PUNICA_CUDA_ARCH_FLAGS}) +# Macro for converting a 'gencode' version number to a cmake version number. +macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() -# the painful way: NOTE needs to only happen w/CUDA +# Initialize the architecure lists to empty. set(VLLM_CUDA_ARCHES) set(VLLM_PUNICA_CUDA_ARCHES) -macro(string_to_ver VER STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) -endmacro() - -set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") - +# Process each 'gencode' flag. foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) + # For each flag we want to extract the version number and whether + # it refers to PTX or native code. + # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) if (COMPUTE) set(COMPUTE ${CMAKE_MATCH_1}) -# message("arch: ${COMPUTE}") endif() - string(REGEX MATCH "code=sm_\([0-9]+a*\)" SM ${ARCH}) + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) if (SM) set(SM ${CMAKE_MATCH_1}) -# message("sm: ${SM}") endif() - string(REGEX MATCH "code=compute_\([0-9]+a*\)" CODE ${ARCH}) + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) if (CODE) set(CODE ${CMAKE_MATCH_1}) -# message("code: ${CODE}") endif() - if (COMPUTE AND SM) - string_to_ver(SM_VER ${SM}) - if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - # TODO: issue warning? - continue() - endif() - list(APPEND VLLM_CUDA_ARCHES ${SM}) - if (SM_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) - endif() + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) else() - string_to_ver(CODE_VER ${CODE}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - # TODO: issue warning? 
- continue() - endif() - list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") - endif() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) endif() -endforeach() -message("post nvcc: ${VLLM_NVCC_FLAGS}") -message("post punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") -message("post nvcc arch: ${VLLM_CUDA_ARCHES}") -message("post punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + # Check if the current version is in the supported arch list + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list + list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + + # Add it to punica arch list if the version is >= 8.0 + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + endif() +endforeach() +message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") +message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") # -# Check for existence of CUDA/HIP language support +# Define targets # -# https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html -#include(CheckLanguage) -#check_language(HIP) -#check_language(CUDA) # picked up by torch -# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat them. +# add comment +# Note: optimization level/debug info is set via cmake build type. +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS + MOD_CUDA_ARCHES) -if(HIP_FOUND) - enable_language(HIP) - list(APPEND VLLM_NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - # TODO: intersect with this list? - if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + # TODO: needed for rocm? + if (IS_CUDA) + set(CUDA_LANG "CUDA") + else() + set(CUDA_LANG "HIP") endif() - foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - endforeach() -elseif(CUDA_FOUND) - enable_language(CUDA) - set(IS_CUDA true) + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - # TODO: check supported? 
+ set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES + "${MOD_CUDA_ARCHES}") -else() - message(FATAL_ERROR "Can't find CUDA or HIP installation.") -endif() + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_NVCC_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE + csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) +endfunction() # -# Define target source files +# _C extension # set(VLLM_EXT_SRC @@ -243,48 +314,50 @@ if(IS_CUDA) "csrc/custom_all_reduce.cu") endif() -#TODO: list files -File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") -File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") +define_module_target(_C + "${VLLM_EXT_SRC}" + "${VLLM_NVCC_FLAGS}" + "${VLLM_CUDA_ARCHES}") # -# Define targets +# _moe_C extension # -set(CMAKE_CXX_STANDARD 17) - -# add comment -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") - - if (IS_CUDA) - set(CUDA_LANG "CUDA") - else() - set(CUDA_LANG "HIP") - endif() +set(VLLM_MOE_EXT_SRC + "csrc/moe/moe_ops.cpp" + "csrc/moe/topk_softmax_kernels.cu") - # Note: optimization level/debug info is set by build type - # target_compile_options(${MOD_NAME} BEFORE PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) - target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) +define_module_target(_moe_C + "${VLLM_MOE_EXT_SRC}" + "${VLLM_NVCC_FLAGS}" + "${VLLM_CUDA_ARCHES}") -# get_target_property(XXX ${MOD_NAME} COMPILE_OPTIONS) -# message("XXX: ${XXX}") -# get_target_property(XXX ${MOD_NAME} COMPILE_FEATURES) -# message("XXX: ${XXX}") - - target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) -endfunction() - -define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") +# +# _punica_C extension +# -#get_cmake_property(_variableNames VARIABLES) -#list (SORT _variableNames) -#foreach (_variableName ${_variableNames}) -# message(STATUS "${_variableName}=${${_variableName}}") -#endforeach() +set(VLLM_PUNICA_EXT_SRC + "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" + "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" + 
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" + "csrc/punica/punica_ops.cc") + +define_module_target(_punica_C + "${VLLM_PUNICA_EXT_SRC}" + "${VLLM_PUNICA_NVCC_FLAGS}" + "${VLLM_PUNICA_CUDA_ARCHES}") From 1e08118ecd5cef52f60e1c1e2af1ea06169fdd92 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 20:51:09 -0500 Subject: [PATCH 31/76] wip --- CMakeLists.txt | 404 +++++++++++++++++++++++++++++++------------------ Dockerfile | 1 + hipify.py | 111 ++++++++++++++ setup.py | 6 +- 4 files changed, 373 insertions(+), 149 deletions(-) create mode 100755 hipify.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 18aadcd28d12a..666d213036995 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,16 +15,27 @@ set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") # -# Loop thru all supported python versions until we find the first match. +# Loop thru all supported python versions until we find the first suitable +# version that has torch installed. +# # Cmake is unable to pick the lowest supported version when multiple # versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. # foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") - find_package(Python ${VER} EXACT COMPONENTS Interpreter Development.Module) + find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) - message(STATUS "Found python version ${VER} (${Python_EXECUTABLE}).") - break() + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "import torch" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR) + + if(PYTHON_ERROR_CODE EQUAL 0) + message(STATUS "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") + break() + endif() endif() endforeach() @@ -57,6 +68,9 @@ endmacro() # find_package(MPI) +#find_package(HIP) +enable_language(HIP) # use FindHIP? + # # Find where user site-packages and torch are installed and add it to cmake's # search path. @@ -79,7 +93,10 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # Torch also imports CUDA/HIP packages with some customizations, so we do not # need to do this explicitly with check_language/enable_language, etc. # -find_package(Torch 2.1.2 EXACT REQUIRED) +#find_package(Torch 2.1.2 EXACT REQUIRED) +find_package(Torch REQUIRED) + +# TODO: warn about wrong version # For some reason torch does not add libtorch_python.so to the list of torch # libraries to link. Find it by hand using 'append_torchlib_if_found' from @@ -101,159 +118,183 @@ endif() # detect them explicitly with check_language, etc. # if (HIP_FOUND) - list(APPEND VLLM_NVCC_FLAGS - "-DUSE_ROCM" - "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__") + message("HIP FOUND") +# enable_language(HIP) # use FindHIP? # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") + else() + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") endif() + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") - foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - endforeach() -endif() - -# TODO: IS_CUDA only? - -# Get common NVCC flags from torch. 
-run_python(VLLM_NVCC_FLAGS - "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - -if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") -endif() + # foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + # list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + # endforeach() -if(NVCC_THREADS) - list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") -endif() + set(VLLM_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) + set(VLLM_PUNICA_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) -set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + # Get common NVCC flags from torch. + run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" + "Failed to determine torch nvcc compiler flags") -# -# Copy flags+update for punica -# - -list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - -message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") -message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") - -# -# Setup/process CUDA arch flags -# -# The torch cmake setup detects and hardcodes the detected architecture flags -# in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported -# architectures and the punica target. So we have to extract and remove all -# the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use -# 'target_compiler_options' for adding '-gencode' arguments so we will use the -# target's CUDA_ARCHITECTURES property instead. This requires repackaging -# the architecture flags into a format that cmake expects for -# CUDA_ARCHITECTURES. -# -# This is a bit fragile in that it depends on torch using -gencode as opposed -# to one of the other nvcc options to specify architectures. -# -# Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override -# detected architectures. -# -message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - -# Extract all '-gencode' flags from CMAKE_CUDA_FLAGS -string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS - ${CMAKE_CUDA_FLAGS}) - -# Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying -# them and passing them back in via the CUDA_ARCHITECTURES property. -string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS - ${CMAKE_CUDA_FLAGS}) - -# If this error is triggered, it might mean that torch has changed how it sets -# up nvcc architecture code generation flags. -if (NOT VLLM_CUDA_ARCH_FLAGS) - message(FATAL_ERROR - "Could not find any architecture related code generation flags in " - "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") -endif() - -message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - -# Macro for converting a 'gencode' version number to a cmake version number. -macro(string_to_ver OUT_VER IN_STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) -endmacro() + run_python(X + "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") -# Initialize the architecure lists to empty. -set(VLLM_CUDA_ARCHES) -set(VLLM_PUNICA_CUDA_ARCHES) + list(APPEND VLLM_NVCC_FLAGS "${X}") -# Process each 'gencode' flag. -foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag we want to extract the version number and whether - # it refers to PTX or native code. 
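# (worked example of the matching below, not from the patch) running
#   string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM "arch=compute_80,code=sm_80")
# stores the whole match "code=sm_80" in SM and the captured digits "80" in
# CMAKE_MATCH_1, which is then assigned back into SM.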
- # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding - # for that match. + list(APPEND VLLM_NVCC_FLAGS + "-DWITH_HIP" + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__") - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) - endif() + # hack +# set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) +# enable_language(CUDA) # use FindHIP? - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) - endif() +else() + # Get common NVCC flags from torch. + run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() - # Make sure the virtual architecture can be matched. - if (NOT COMPUTE) - message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") + if(NVCC_THREADS) + list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") endif() - # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) + set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + + # + # Copy flags+update for punica + # + + list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + + message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") + message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + + # + # Setup/process CUDA arch flags + # + # The torch cmake setup detects and hardcodes the detected architecture flags + # in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported + # architectures and the punica target. So we have to extract and remove all + # the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use + # 'target_compiler_options' for adding '-gencode' arguments so we will use the + # target's CUDA_ARCHITECTURES property instead. This requires repackaging + # the architecture flags into a format that cmake expects for + # CUDA_ARCHITECTURES. + # + # This is a bit fragile in that it depends on torch using -gencode as opposed + # to one of the other nvcc options to specify architectures. + # + # Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override + # detected architectures. + # + message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # Extract all '-gencode' flags from CMAKE_CUDA_FLAGS + string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying + # them and passing them back in via the CUDA_ARCHITECTURES property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # If this error is triggered, it might mean that torch has changed how it sets + # up nvcc architecture code generation flags. 
+ if (NOT VLLM_CUDA_ARCH_FLAGS) message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") - endif() - - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) - else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) - endif() - - # Check if the current version is in the supported arch list - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - message(STATUS "discarding unsupported CUDA arch ${VER}.") - continue() + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") endif() - # Add it to the arch list - list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") - - # Add it to punica arch list if the version is >= 8.0 - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") - endif() -endforeach() + message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") + + # Macro for converting a 'gencode' version number to a cmake version number. + macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) + endmacro() + + # Initialize the architecure lists to empty. + set(VLLM_CUDA_ARCHES) + set(VLLM_PUNICA_CUDA_ARCHES) + + # Process each 'gencode' flag. + foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) + # For each flag we want to extract the version number and whether + # it refers to PTX or native code. + # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) + endif() + + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) + else() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) + endif() + + # Check if the current version is in the supported arch list + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list + list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + + # Add it to punica arch list if the version is >= 8.0 + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + endif() + endforeach() -message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") -message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") + message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") +endif() # # Define targets @@ -264,19 +305,37 @@ message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - # TODO: needed for rocm? 
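# (note on the intent here, phrased as the usual CMake idiom rather than quoted
# from the patch) CUDA_LANG selects which compile language the extra GPU flags
# are scoped to, typically via a per-language generator expression such as:
#   target_compile_options(${MOD_NAME} PRIVATE
#     $<$<COMPILE_LANGUAGE:CUDA>:${MOD_EXTRA_NVCC_FLAGS}>)
# with HIP builds substituting COMPILE_LANGUAGE:HIP.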
if (IS_CUDA) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + set(CUDA_LANG "CUDA") + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES + "${MOD_CUDA_ARCHES}") + else() + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) +# hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set(CUDA_LANG "HIP") + foreach(SRC ${MOD_SRC}) + if (${SRC} MATCHES "\.hip$") + message("setting HIP on ${SRC}") + set_source_files_properties(${SRC} PROPERTIES LANGUAGE HIP) + #set_source_files_properties(${SRC} PROPERTIES LANGUAGE CUDA) + #set_source_files_properties(${SRC} PROPERTIES LANGUAGE ${CUDA_LANG}) + endif() + endforeach() + + message("got here! ${MOD_CUDA_ARCHES}") + set_target_properties(${MOD_NAME} PROPERTIES HIP_ARCHITECTURES + "${MOD_CUDA_ARCHES}") + endif() set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES - "${MOD_CUDA_ARCHES}") +# set_target_properties(${MOD_NAME} PROPERTIES LINKER_LANGUAGE CXX) target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) @@ -285,9 +344,17 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + if (TRUE OR IS_CUDA) + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + else() + # -- Python_SOABI=cpython-39-x86_64-linux-gnu + message("got here ${_PYTHON_INCLUDE_DIR}, ${_PYTHON_LIBRARY}") + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES} ${_Python_LIBRARY}) + target_include_directories(${MOD_NAME} PRIVATE ${Python_INCLUDE_DIRS}) + endif() install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() @@ -305,15 +372,56 @@ set(VLLM_EXT_SRC "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu" - "csrc/pybind.cpp") + "csrc/moe_align_block_size_kernels.cu") + +set(VLLM_EXT_ROCM_SRC + "csrc/cache_kernels.hip" + "csrc/attention/attention_kernels.hip" + "csrc/pos_encoding_kernels.hip" + "csrc/activation_kernels.hip" + "csrc/layernorm_kernels.hip" + "csrc/quantization/squeezellm/quant_hip_kernel.hip" + "csrc/quantization/gptq/q_gemm.hip" + "csrc/hip_utils_kernels.hip" + "csrc/moe_align_block_size_kernels.hip") if(IS_CUDA) list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") +elseif(HIP_FOUND) + # maybe use add_custom_target instead + add_dependencies? + message("build dir ${CMAKE_CURRENT_BINARY_DIR}") + + set(X) + foreach (SRC ${VLLM_EXT_SRC}) + list(APPEND X ${SRC}) + endforeach() + + message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") + add_custom_command( + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . 
+ COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} + DEPENDS hipify.py ${VLLM_EXT_SRC} + OUTPUT ${VLLM_EXT_ROCM_SRC}) + +# add_custom_command( +# COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} +# DEPENDS hipify.py ${VLLM_EXT_SRC} +# WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +# OUTPUT ${VLLM_EXT_ROCM_SRC} +# VERBATIM) + +# set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + set(VLLM_EXT_SRC) + foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) + endforeach() + message("final src: ${VLLM_EXT_SRC}") endif() +list(APPEND VLM_EXT_SRC "csrc/pybind.cpp") #? + define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" @@ -361,3 +469,9 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") + +get_cmake_property(_variableNames VARIABLES) +list (SORT _variableNames) +foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") +endforeach() diff --git a/Dockerfile b/Dockerfile index c2354ca1f470d..97e629dc07abb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py +COPY hipify.py hipify.py COPY CMakeLists.txt CMakeLists.txt COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml diff --git a/hipify.py b/hipify.py new file mode 100755 index 0000000000000..93e5c9d78d6a4 --- /dev/null +++ b/hipify.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +import argparse +import os + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + print(f"CWD {os.getcwd()}") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "-b", + "--build_dir", + help="The build directory.", + ) + + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + parser.add_argument( + "-i", + "--include_dir", + help="Include directory", + action="append", + default=[], + ) + + parser.add_argument( + "sources", + help="Source files to hipify.", + nargs="*", + default=[] + ) + + args = parser.parse_args() + + print(args.output_dir) + + # limit scope to build_dir only + includes = [os.path.join(args.build_dir, '*')] + print(f"includes {includes}") + + extra_files = [os.path.abspath(s) for s in args.sources] + print(f"extra_files {extra_files}") + + hipify_result = hipify( + project_directory=args.build_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + #print(hipify_result) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if (s_abs in hipify_result and + hipify_result[s_abs].hipified_path is not None) else s_abs) + if True: + hipified_sources.append(hipified_s_abs) + else: + hipified_sources.append( + os.path.relpath(hipified_s_abs, + os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + + assert(len(hipified_sources) == len(args.sources)) + + # print("\n".join(hipified_sources)) + +# print(f"got here {args.output_dir}") +# os.system(f"find {args.output_dir} -name '*.hip'") +# print("end got here") + +# print(f"got here root") +# os.system(f"find /app/vllm -name '*.hip'") +# print("end got here root") + +# project_directory /app/vllm +# show_detailed True +# 
extensions ('.cu', '.cuh', '.c', '.cc', '.cpp', '.h', '.in', '.hpp') +# header_extensions ('.cuh', '.h', '.hpp') +# output_directory /app/vllm +# header_include_dirs [] +# includes ['/app/vllm/*'] +# extra_files [ +# '/app/vllm/csrc/cache_kernels.cu', +# '/app/vllm/csrc/attention/attention_kernels.cu', +# '/app/vllm/csrc/pos_encoding_kernels.cu', +# '/app/vllm/csrc/activation_kernels.cu', +# '/app/vllm/csrc/layernorm_kernels.cu', +# '/app/vllm/csrc/quantization/squeezellm/quant_cuda_kernel.cu', +# '/app/vllm/csrc/quantization/gptq/q_gemm.cu', +# '/app/vllm/csrc/cuda_utils_kernels.cu', +# '/app/vllm/csrc/moe_align_block_size_kernels.cu', +# '/app/vllm/csrc/pybind.cpp' +# ] +# out_of_place_only False +# ignores () +# show_progress True +# hip_clang_launch False +# is_pytorch_extension True +# hipify_extra_files_only True diff --git a/setup.py b/setup.py index 69fcfa047ef97..79e9f852db76f 100644 --- a/setup.py +++ b/setup.py @@ -69,10 +69,7 @@ def build_extensions(self): # temporary build directory instead '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( cfg.upper(), self.build_temp), - # Hint CMake to use the same Python executable that - # is launching the build, prevents possible mismatching if - # multiple versions of Python are installed - '-DPYTHON_EXECUTABLE={}'.format(sys.executable), + '--log-level=TRACE', ] # TODO: change default to 0 @@ -113,6 +110,7 @@ def build_extensions(self): build_jobs = ['-j', str(num_jobs)] # Config + # TODO: this only needs to happen once subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + cmake_args, cwd=self.build_temp) From 3d042550aa742a6c7ca84b7ef2003fec8eda99b1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:11:45 -0500 Subject: [PATCH 32/76] hacked up rocm support --- CMakeLists.txt | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 666d213036995..18a3355241cd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,15 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +function (dumpvars MSG) + message(${MSG}) + get_cmake_property(_variableNames VARIABLES) + list (SORT _variableNames) + foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") + endforeach() +endfunction() + # # Supported python verions. These versions will be searched in order, the # first match will be selected. @@ -148,10 +157,11 @@ if (HIP_FOUND) list(APPEND VLLM_NVCC_FLAGS "${X}") list(APPEND VLLM_NVCC_FLAGS - "-DWITH_HIP" +# "-DWITH_HIP" "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__") + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") # hack # set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) @@ -317,6 +327,8 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) # hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) + add_dependencies(${MOD_NAME} hipify) + set(CUDA_LANG "HIP") foreach(SRC ${MOD_SRC}) if (${SRC} MATCHES "\.hip$") @@ -390,20 +402,37 @@ if(IS_CUDA) "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") elseif(HIP_FOUND) + dumpvars("VARS") + # maybe use add_custom_target instead + add_dependencies? 
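# (illustrative invocation of the hipify.py helper added above; the paths are
# placeholders)
#   ./hipify.py -b build/temp/csrc -o build/temp/csrc -i build/temp/csrc \
#       csrc/cache_kernels.cu csrc/attention/attention_kernels.cu
# which is expected to write .hip translations of the listed .cu sources under
# the build tree.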
message("build dir ${CMAKE_CURRENT_BINARY_DIR}") set(X) - foreach (SRC ${VLLM_EXT_SRC}) - list(APPEND X ${SRC}) + foreach (SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND X "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() + set(VLLM_EXT_ROCM_SRC ${X}) message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") - add_custom_command( - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} +# add_custom_command( +# COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" +# COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . +# COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} +# DEPENDS hipify.py ${VLLM_EXT_SRC} +# OUTPUT ${VLLM_EXT_ROCM_SRC} +# COMMENT "run hipify") + + message("rocm src: ${VLLM_EXT_ROCM_SRC}") + + add_custom_target( + hipify + COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO2" + COMMAND pwd + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc -i ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} DEPENDS hipify.py ${VLLM_EXT_SRC} - OUTPUT ${VLLM_EXT_ROCM_SRC}) + BYPRODUCTS ${VLLM_EXT_ROCM_SRC} + COMMENT "run hipify2") # add_custom_command( # COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} @@ -412,15 +441,17 @@ elseif(HIP_FOUND) # OUTPUT ${VLLM_EXT_ROCM_SRC} # VERBATIM) -# set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - set(VLLM_EXT_SRC) - foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) - endforeach() + set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + +# set(VLLM_EXT_SRC) +# foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) +# list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) +# endforeach() message("final src: ${VLLM_EXT_SRC}") endif() -list(APPEND VLM_EXT_SRC "csrc/pybind.cpp") #? +list(APPEND VLLM_EXT_SRC "csrc/pybind.cpp") # or leave in original list? 
+#set(VLLM_EXT_SRC "csrc/pybind.cpp") define_module_target(_C "${VLLM_EXT_SRC}" @@ -469,9 +500,3 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") - -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") -endforeach() From 84bdbc35e7f1805205e572a2a8e959d5e7a8cd7f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:13:32 -0500 Subject: [PATCH 33/76] fix merge issue --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18a3355241cd5..e07273eca2a58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures -set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") +set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # # Loop thru all supported python versions until we find the first suitable From 6ad7e54e80b7166569638638461c607fcea606c0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:22:27 -0500 Subject: [PATCH 34/76] run format.sh --- hipify.py | 40 ++++++++++++++++++++-------------------- setup.py | 1 - 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/hipify.py b/hipify.py index 93e5c9d78d6a4..d7cb1a0ed0ed6 100755 --- a/hipify.py +++ b/hipify.py @@ -30,12 +30,10 @@ default=[], ) - parser.add_argument( - "sources", - help="Source files to hipify.", - nargs="*", - default=[] - ) + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) args = parser.parse_args() @@ -48,31 +46,33 @@ extra_files = [os.path.abspath(s) for s in args.sources] print(f"extra_files {extra_files}") - hipify_result = hipify( - project_directory=args.build_dir, - output_directory=args.output_dir, - header_include_dirs=[], - includes=includes, - extra_files=extra_files, - show_detailed=True, - is_pytorch_extension=True, - hipify_extra_files_only=True) + hipify_result = hipify(project_directory=args.build_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) #print(hipify_result) hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) - hipified_s_abs = (hipify_result[s_abs].hipified_path if (s_abs in hipify_result and - hipify_result[s_abs].hipified_path is not None) else s_abs) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) if True: hipified_sources.append(hipified_s_abs) else: hipified_sources.append( - os.path.relpath(hipified_s_abs, - os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + os.path.relpath( + hipified_s_abs, + os.path.abspath(os.path.join(args.build_dir, os.pardir)))) - assert(len(hipified_sources) == len(args.sources)) + assert (len(hipified_sources) == len(args.sources)) # print("\n".join(hipified_sources)) diff --git a/setup.py b/setup.py index 79e9f852db76f..006897de3812b 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ import os import re import subprocess -import sys from typing import List from packaging.version import parse, Version From 1d50fa7fea622f14b1870ada4e48b63202cdd7d0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 
Feb 2024 21:23:30 -0500 Subject: [PATCH 35/76] fix enable_language --- CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e07273eca2a58..17709e09d0a18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,9 +77,6 @@ endmacro() # find_package(MPI) -#find_package(HIP) -enable_language(HIP) # use FindHIP? - # # Find where user site-packages and torch are installed and add it to cmake's # search path. @@ -128,7 +125,7 @@ endif() # if (HIP_FOUND) message("HIP FOUND") -# enable_language(HIP) # use FindHIP? + enable_language(HIP) # use FindHIP? # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) From 10e35263a714260124ad58923881e2e924cbf753 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 00:37:00 -0500 Subject: [PATCH 36/76] cleanups --- CMakeLists.txt | 261 ++++++++++++++++++++++--------------------------- 1 file changed, 117 insertions(+), 144 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 17709e09d0a18..61208af169e84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,15 +2,6 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -function (dumpvars MSG) - message(${MSG}) - get_cmake_property(_variableNames VARIABLES) - list (SORT _variableNames) - foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") - endforeach() -endfunction() - # # Supported python verions. These versions will be searched in order, the # first match will be selected. @@ -99,11 +90,8 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # Torch also imports CUDA/HIP packages with some customizations, so we do not # need to do this explicitly with check_language/enable_language, etc. # -#find_package(Torch 2.1.2 EXACT REQUIRED) find_package(Torch REQUIRED) -# TODO: warn about wrong version - # For some reason torch does not add libtorch_python.so to the list of torch # libraries to link. Find it by hand using 'append_torchlib_if_found' from # torch's cmake setup. @@ -115,83 +103,109 @@ endif() if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) + + # Verify torch version and warn if it is not expected. + if (NOT Torch_VERSION VERSION_EQUAL 2.1.2) + message(WARNING "Pytorch version 2.1.2 expected for CUDA build, " + "saw ${Torch_VERSION} instead.") + endif() +else() + # Verify torch version and warn if it is not expected (derived from Dockerfile.rocm) + # ROCm 5.7 -> torch 2.0.1 + if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL 2.0.1) + message(WARNING "Pytorch version 2.0.1 expected for ROCMm 5.x build, " + "saw ${Torch_VERSION} instead.") + endif() + + # ROCm 6.0 -> torch 2.1.1 + if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND NOT Torch_VERSION VERSION_EQUAL 2.1.1) + message(WARNING "Pytorch version 2.1.1 expected for ROCMm 6.x build, " + "saw ${Torch_VERSION} instead.") + endif() endif() # -# Setup extra NVCC flags +# Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA +# and hip flags for ROCm. # # Note: CUDA + HIP are detected by pytorch package so there's no need to repeat # detect them explicitly with check_language, etc. # if (HIP_FOUND) - message("HIP FOUND") - enable_language(HIP) # use FindHIP? + # Importing torch recognizes and sets up some HIP/ROCm stuff but not all. + # If we want cmake to be able to understand the .hip extension automatically, + # we need to enable HIP explicitly. + enable_language(HIP) - # TODO: intersect with this list? 
- if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") - else() - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") - endif() - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") + # + # VLLM_HIP_ARCHITECUTRES will control the offload-arch flags. + # CMAKE_HIP_ARCHITECTURES is setup by pytorch and can be controlled + # via the PYTORCH_ROCM_ARCH env variable. + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(VLLM_HIP_ARCHITECTURES) + foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS) + list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH}) + endif() + endforeach() - # foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - # list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - # endforeach() + if(NOT VLLM_HIP_ARCHITECTURES) + message(FATAL_ERROR + "None of the detected ROCM architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCM architectures are: ${ROCM_SUPPORTED_ARCHS}.") + endif() - set(VLLM_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) - set(VLLM_PUNICA_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) + set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) + set(VLLM_PUNICA_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - # Get common NVCC flags from torch. - run_python(VLLM_NVCC_FLAGS + # Get common HIP/HIPCC flags from torch. + run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" "Failed to determine torch nvcc compiler flags") - run_python(X + run_python(VLLM_HIPCC_FLAGS "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND VLLM_NVCC_FLAGS "${X}") + list(APPEND VLLM_GPU_FLAGS ${VLLM_HIPCC_FLAGS}) - list(APPEND VLLM_NVCC_FLAGS -# "-DWITH_HIP" + list(APPEND VLLM_GPU_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") - # hack -# set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) -# enable_language(CUDA) # use FindHIP? - else() # Get common NVCC flags from torch. - run_python(VLLM_NVCC_FLAGS + run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_FP8_E5M2") endif() if(NVCC_THREADS) - list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() - set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) # # Copy flags+update for punica # - list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") - message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") - message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + message(DEBUG "nvcc: ${VLLM_GPU_FLAGS}") + message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") # # Setup/process CUDA arch flags @@ -238,8 +252,8 @@ else() endmacro() # Initialize the architecure lists to empty. - set(VLLM_CUDA_ARCHES) - set(VLLM_PUNICA_CUDA_ARCHES) + set(VLLM_GPU_ARCHES) + set(VLLM_PUNICA_GPU_ARCHES) # Process each 'gencode' flag. 
foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) @@ -291,16 +305,16 @@ else() endif() # Add it to the arch list - list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") # Add it to punica arch list if the version is >= 8.0 if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") endif() endforeach() - message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") - message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + message(DEBUG "nvcc arch: ${VLLM_GPU_ARCHES}") + message(DEBUG "punica arch: ${VLLM_PUNICA_GPU_ARCHES}") endif() # @@ -309,45 +323,30 @@ endif() # add comment # Note: optimization level/debug info is set via cmake build type. -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS - MOD_CUDA_ARCHES) +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS + MOD_GPU_ARCHES) - # TODO: needed for rocm? - if (IS_CUDA) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - set(CUDA_LANG "CUDA") - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES - "${MOD_CUDA_ARCHES}") + # Note: for ROCm builds we let the proper flags and libraries get + # pulled in by torch. + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + if (IS_CUDA) + set(GPU_LANG "CUDA") else() - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) -# hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) - + set(GPU_LANG "HIP") + # Make this target dependent on the hipify preprocessor step. + # TODO: consider moving hipify step into here so it could apply to + # any target add_dependencies(${MOD_NAME} hipify) - - set(CUDA_LANG "HIP") - foreach(SRC ${MOD_SRC}) - if (${SRC} MATCHES "\.hip$") - message("setting HIP on ${SRC}") - set_source_files_properties(${SRC} PROPERTIES LANGUAGE HIP) - #set_source_files_properties(${SRC} PROPERTIES LANGUAGE CUDA) - #set_source_files_properties(${SRC} PROPERTIES LANGUAGE ${CUDA_LANG}) - endif() - endforeach() - - message("got here! 
${MOD_CUDA_ARCHES}") - set_target_properties(${MOD_NAME} PROPERTIES HIP_ARCHITECTURES - "${MOD_CUDA_ARCHES}") - endif() - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") -# set_target_properties(${MOD_NAME} PROPERTIES LINKER_LANGUAGE CXX) + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_NVCC_FLAGS}>) + $<$:${MOD_EXTRA_GPU_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") @@ -355,15 +354,7 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - - if (TRUE OR IS_CUDA) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - else() - # -- Python_SOABI=cpython-39-x86_64-linux-gnu - message("got here ${_PYTHON_INCLUDE_DIR}, ${_PYTHON_LIBRARY}") - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES} ${_Python_LIBRARY}) - target_include_directories(${MOD_NAME} PRIVATE ${Python_INCLUDE_DIRS}) - endif() + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() @@ -383,77 +374,59 @@ set(VLLM_EXT_SRC "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu") -set(VLLM_EXT_ROCM_SRC - "csrc/cache_kernels.hip" - "csrc/attention/attention_kernels.hip" - "csrc/pos_encoding_kernels.hip" - "csrc/activation_kernels.hip" - "csrc/layernorm_kernels.hip" - "csrc/quantization/squeezellm/quant_hip_kernel.hip" - "csrc/quantization/gptq/q_gemm.hip" - "csrc/hip_utils_kernels.hip" - "csrc/moe_align_block_size_kernels.hip") +set(VLLM_EXT_CXX_SRC "csrc/pybind.cpp") if(IS_CUDA) list(APPEND VLLM_EXT_SRC + ${VLLM_EXT_CXX_SRC} "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") -elseif(HIP_FOUND) - dumpvars("VARS") - # maybe use add_custom_target instead + add_dependencies? - message("build dir ${CMAKE_CURRENT_BINARY_DIR}") +elseif(HIP_FOUND) - set(X) - foreach (SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND X "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + # + # Generate ROCM/HIP source file names from CUDA file names. + # + set(VLLM_ROCM_EXT_SRC) + foreach (SRC ${VLLM_EXT_SRC}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND VLLM_EXT_ROCM_SRC "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - set(VLLM_EXT_ROCM_SRC ${X}) + message("rocm src: ${VLLM_EXT_ROCM_SRC}") - message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") -# add_custom_command( -# COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" -# COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . -# COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} -# DEPENDS hipify.py ${VLLM_EXT_SRC} -# OUTPUT ${VLLM_EXT_ROCM_SRC} -# COMMENT "run hipify") + # TODO: this needs a bunch more work - message("rocm src: ${VLLM_EXT_ROCM_SRC}") + # add_custom_command( + # COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" + # COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . 
+ # COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} + # DEPENDS hipify.py ${VLLM_EXT_SRC} + # OUTPUT ${VLLM_EXT_ROCM_SRC} + # COMMENT "run hipify") + # TODO: move copy into python (or figure out how to make hipify work properly) + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify - COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO2" - COMMAND pwd COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc -i ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${VLLM_EXT_SRC} DEPENDS hipify.py ${VLLM_EXT_SRC} BYPRODUCTS ${VLLM_EXT_ROCM_SRC} - COMMENT "run hipify2") - -# add_custom_command( -# COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} -# DEPENDS hipify.py ${VLLM_EXT_SRC} -# WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} -# OUTPUT ${VLLM_EXT_ROCM_SRC} -# VERBATIM) - - set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - -# set(VLLM_EXT_SRC) -# foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) -# list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) -# endforeach() - message("final src: ${VLLM_EXT_SRC}") -endif() + COMMENT "Running hipify on extension source files.") + + # Swap out original extension sources with hipified sources. + set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND VLLM_EXT_SRC ${VLLM_EXT_CXX_SRC}) -list(APPEND VLLM_EXT_SRC "csrc/pybind.cpp") # or leave in original list? -#set(VLLM_EXT_SRC "csrc/pybind.cpp") + message(DEBUG "final ext src: ${VLLM_EXT_SRC}") + +endif() define_module_target(_C "${VLLM_EXT_SRC}" - "${VLLM_NVCC_FLAGS}" - "${VLLM_CUDA_ARCHES}") + "${VLLM_GPU_FLAGS}" + "${VLLM_GPU_ARCHES}") # # _moe_C extension @@ -465,8 +438,8 @@ set(VLLM_MOE_EXT_SRC define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" - "${VLLM_NVCC_FLAGS}" - "${VLLM_CUDA_ARCHES}") + "${VLLM_GPU_FLAGS}" + "${VLLM_GPU_ARCHES}") # # _punica_C extension @@ -495,5 +468,5 @@ set(VLLM_PUNICA_EXT_SRC define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" - "${VLLM_PUNICA_NVCC_FLAGS}" - "${VLLM_PUNICA_CUDA_ARCHES}") + "${VLLM_PUNICA_GPU_FLAGS}" + "${VLLM_PUNICA_GPU_ARCHES}") From b56afb1315ad3929e75aca45eddeb8a6380160f6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:24:08 -0500 Subject: [PATCH 37/76] more hipify cleanups --- CMakeLists.txt | 92 +++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 61208af169e84..cc5b1b303941d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,11 +321,57 @@ endif() # Define targets # +# add comment +macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + message(DEBUG "sources: ${SRCS}") + message(DEBUG "cxx sources: ${CXX_SRCS}") + + # + # Generate ROCM/HIP source file names from CUDA file names. 
+ # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + message("rocm src: ${HIP_SRCS}") + + # TODO: hipify script needs a bunch more work + # TODO: move copy into python (or figure out how to make hipify work properly) + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) + + message(DEBUG "final ext src: ${OUT_SRCS}") +endmacro() + # add comment # Note: optimization level/debug info is set via cmake build type. function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + if (NOT IS_CUDA AND HIP_FOUND) + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + # Note: for ROCm builds we let the proper flags and libraries get # pulled in by torch. Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) @@ -334,10 +380,9 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS set(GPU_LANG "CUDA") else() set(GPU_LANG "HIP") + # Make this target dependent on the hipify preprocessor step. - # TODO: consider moving hipify step into here so it could apply to - # any target - add_dependencies(${MOD_NAME} hipify) + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) endif() set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES @@ -372,54 +417,17 @@ set(VLLM_EXT_SRC "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu") - -set(VLLM_EXT_CXX_SRC "csrc/pybind.cpp") + "csrc/moe_align_block_size_kernels.cu" + "csrc/pybind.cpp") if(IS_CUDA) list(APPEND VLLM_EXT_SRC - ${VLLM_EXT_CXX_SRC} "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") elseif(HIP_FOUND) - # - # Generate ROCM/HIP source file names from CUDA file names. - # - set(VLLM_ROCM_EXT_SRC) - foreach (SRC ${VLLM_EXT_SRC}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND VLLM_EXT_ROCM_SRC "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - message("rocm src: ${VLLM_EXT_ROCM_SRC}") - - # TODO: this needs a bunch more work - - # add_custom_command( - # COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" - # COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . - # COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} - # DEPENDS hipify.py ${VLLM_EXT_SRC} - # OUTPUT ${VLLM_EXT_ROCM_SRC} - # COMMENT "run hipify") - - # TODO: move copy into python (or figure out how to make hipify work properly) - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${VLLM_EXT_SRC} - DEPENDS hipify.py ${VLLM_EXT_SRC} - BYPRODUCTS ${VLLM_EXT_ROCM_SRC} - COMMENT "Running hipify on extension source files.") - - # Swap out original extension sources with hipified sources. 
- set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND VLLM_EXT_SRC ${VLLM_EXT_CXX_SRC}) - message(DEBUG "final ext src: ${VLLM_EXT_SRC}") endif() From b3f3b554c78c4a0d6d69bf242a5c2b68c9212ad8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:29:47 -0500 Subject: [PATCH 38/76] comment --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc5b1b303941d..ef4a783388d87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,6 +369,7 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) if (NOT IS_CUDA AND HIP_FOUND) + # Add hipify preprocessing step if we are running on AMD. hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() From 7f0c908e1be07e1714bf1a6c3efd03b4ee7163db Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:35:29 -0500 Subject: [PATCH 39/76] fix ruff --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef4a783388d87..3a92952d02bb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # -# Supported python verions. These versions will be searched in order, the +# Supported python versions. These versions will be searched in order, the # first match will be selected. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") @@ -251,7 +251,7 @@ else() string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) endmacro() - # Initialize the architecure lists to empty. + # Initialize the architecture lists to empty. set(VLLM_GPU_ARCHES) set(VLLM_PUNICA_GPU_ARCHES) From a8a8bd8befd46fb7f565cebf7efe0a643a49db3a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 12:18:56 -0500 Subject: [PATCH 40/76] code cleanups --- CMakeLists.txt | 14 ++++---------- hipify.py | 50 +++++++------------------------------------------- 2 files changed, 11 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a92952d02bb9..3be55b302c2e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) + # Note: find_package(Torch) won't work here because cmake might not + # have the proper search path set yet. 
execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "import torch" @@ -343,15 +345,12 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - message("rocm src: ${HIP_SRCS}") + message(DEBUG "hip src: ${HIP_SRCS}") - # TODO: hipify script needs a bunch more work - # TODO: move copy into python (or figure out how to make hipify work properly) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} -i ${CMAKE_SOURCE_DIR}/csrc ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on extension source files.") @@ -425,11 +424,6 @@ if(IS_CUDA) list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") - -elseif(HIP_FOUND) - - - endif() define_module_target(_C diff --git a/hipify.py b/hipify.py index d7cb1a0ed0ed6..282a9062e3836 100755 --- a/hipify.py +++ b/hipify.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 import argparse +import shutil import os from torch.utils.hipify.hipify_python import hipify if __name__ == '__main__': - print(f"CWD {os.getcwd()}") - parser = argparse.ArgumentParser() parser.add_argument( @@ -37,14 +36,14 @@ args = parser.parse_args() - print(args.output_dir) - # limit scope to build_dir only includes = [os.path.join(args.build_dir, '*')] - print(f"includes {includes}") extra_files = [os.path.abspath(s) for s in args.sources] - print(f"extra_files {extra_files}") + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. + shutil.copytree(args.build_dir, args.output_dir, dirs_exist_ok=True) hipify_result = hipify(project_directory=args.build_dir, output_directory=args.output_dir, @@ -55,8 +54,6 @@ is_pytorch_extension=True, hipify_extra_files_only=True) - #print(hipify_result) - hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) @@ -74,38 +71,5 @@ assert (len(hipified_sources) == len(args.sources)) - # print("\n".join(hipified_sources)) - -# print(f"got here {args.output_dir}") -# os.system(f"find {args.output_dir} -name '*.hip'") -# print("end got here") - -# print(f"got here root") -# os.system(f"find /app/vllm -name '*.hip'") -# print("end got here root") - -# project_directory /app/vllm -# show_detailed True -# extensions ('.cu', '.cuh', '.c', '.cc', '.cpp', '.h', '.in', '.hpp') -# header_extensions ('.cuh', '.h', '.hpp') -# output_directory /app/vllm -# header_include_dirs [] -# includes ['/app/vllm/*'] -# extra_files [ -# '/app/vllm/csrc/cache_kernels.cu', -# '/app/vllm/csrc/attention/attention_kernels.cu', -# '/app/vllm/csrc/pos_encoding_kernels.cu', -# '/app/vllm/csrc/activation_kernels.cu', -# '/app/vllm/csrc/layernorm_kernels.cu', -# '/app/vllm/csrc/quantization/squeezellm/quant_cuda_kernel.cu', -# '/app/vllm/csrc/quantization/gptq/q_gemm.cu', -# '/app/vllm/csrc/cuda_utils_kernels.cu', -# '/app/vllm/csrc/moe_align_block_size_kernels.cu', -# '/app/vllm/csrc/pybind.cpp' -# ] -# out_of_place_only False -# ignores () -# show_progress True -# hip_clang_launch False -# is_pytorch_extension True -# hipify_extra_files_only True + # Print hipified source files. 
+ print("\n".join(hipified_sources)) From d9ed8b93daa5b4fa3c322faeda67390f63f69512 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 13:15:24 -0500 Subject: [PATCH 41/76] add comments --- CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++------------------ hipify.py | 42 +++++++++++++++--------------- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3be55b302c2e2..b084768041ac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(vllm_extensions LANGUAGES CXX) # # Supported python versions. These versions will be searched in order, the -# first match will be selected. +# first match is be selected. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") @@ -25,6 +25,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) + # Attempt to import torch from python we just found. If so, stop + # searching for other versions of python. # Note: find_package(Torch) won't work here because cmake might not # have the proper search path set yet. execute_process( @@ -66,7 +68,7 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find MPI package +# Try to find the MPI package # find_package(MPI) @@ -103,6 +105,9 @@ if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() +# +# Check the torch version and warn if it isn't what is expected. +# if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) @@ -134,15 +139,18 @@ endif() # detect them explicitly with check_language, etc. # if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm stuff but not all. - # If we want cmake to be able to understand the .hip extension automatically, - # we need to enable HIP explicitly. + # Importing torch recognizes and sets up some HIP/ROCm configuration but + # does not let cmake recognize .hip files. If we want cmake to be able to + # understand the .hip extension automatically, we need to enable HIP + # explicitly. enable_language(HIP) # - # VLLM_HIP_ARCHITECUTRES will control the offload-arch flags. - # CMAKE_HIP_ARCHITECTURES is setup by pytorch and can be controlled + # VLLM_HIP_ARCHITECUTRES controls the --offload-arch flags. + # CMAKE_HIP_ARCHITECTURES is set up by pytorch and can be controlled # via the PYTORCH_ROCM_ARCH env variable. + # + # # Find the intersection of the supported + detected architectures to # set the module architecture flags. @@ -172,16 +180,17 @@ if (HIP_FOUND) "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND VLLM_GPU_FLAGS ${VLLM_HIPCC_FLAGS}) - list(APPEND VLLM_GPU_FLAGS + ${VLLM_HIPCC_FLAGS} "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") else() + # # Get common NVCC flags from torch. 
+ # run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") @@ -199,7 +208,6 @@ else() # # Copy flags+update for punica # - list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" @@ -213,13 +221,13 @@ else() # Setup/process CUDA arch flags # # The torch cmake setup detects and hardcodes the detected architecture flags - # in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported - # architectures and the punica target. So we have to extract and remove all - # the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use - # 'target_compiler_options' for adding '-gencode' arguments so we will use the - # target's CUDA_ARCHITECTURES property instead. This requires repackaging - # the architecture flags into a format that cmake expects for - # CUDA_ARCHITECTURES. + # in CMAKE_CUDA_FLAGS. Since CMAKE_CUDA_FLAGS is a "global" variable, we can't + # modify it on a per-target basis, i.e. for the punica extension. + # So we have to extract and remove all the '-gencode' flags from + # CMAKE_CUDA_FLAGS for processing. We can't use 'target_compiler_options' + # for adding '-gencode' arguments so we use the target's CUDA_ARCHITECTURES + # property instead. This requires repackaging the architecture flags into a + # format that cmake expects for CUDA_ARCHITECTURES. # # This is a bit fragile in that it depends on torch using -gencode as opposed # to one of the other nvcc options to specify architectures. @@ -261,7 +269,7 @@ else() foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) # For each flag we want to extract the version number and whether # it refers to PTX or native code. - # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # Note: if a regex matches then CMAKE_MATCH_1 holds the binding # for that match. string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) @@ -323,7 +331,11 @@ endif() # Define targets # -# add comment +# +# Add a target named `NAME` that runs the hipify preprocessor on a set of +# CUDA source files. The names of the corresponding "hipified" sources +# are stored in `OUT_SRCS`. +# macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # # Split into C++ and non-C++ (i.e. CUDA) sources @@ -350,7 +362,7 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} -i ${CMAKE_SOURCE_DIR}/csrc ${SRCS} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on extension source files.") @@ -362,18 +374,27 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) message(DEBUG "final ext src: ${OUT_SRCS}") endmacro() -# add comment +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to cmake documentation on CMAKE_CUDA_ARCHITECTURES +# and CMAKE_HIP_ARCHITECTURES for more info. +# # Note: optimization level/debug info is set via cmake build type. 
+# function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + # Add hipify preprocessing step if we are building with HIP/ROCm. if (NOT IS_CUDA AND HIP_FOUND) - # Add hipify preprocessing step if we are running on AMD. hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() - # Note: for ROCm builds we let the proper flags and libraries get - # pulled in by torch. Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) if (IS_CUDA) diff --git a/hipify.py b/hipify.py index 282a9062e3836..c4d8450630ba3 100755 --- a/hipify.py +++ b/hipify.py @@ -1,5 +1,13 @@ #!/usr/bin/env python3 +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + import argparse import shutil import os @@ -9,26 +17,21 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() + # Project directory where all the source + include files live. parser.add_argument( - "-b", - "--build_dir", - help="The build directory.", + "-p", + "--project_dir", + help="The project directory.", ) + # Directory where hipified files are written. parser.add_argument( "-o", "--output_dir", help="The output directory.", ) - parser.add_argument( - "-i", - "--include_dir", - help="Include directory", - action="append", - default=[], - ) - + # Source files to convert. parser.add_argument("sources", help="Source files to hipify.", nargs="*", @@ -36,16 +39,17 @@ args = parser.parse_args() - # limit scope to build_dir only - includes = [os.path.join(args.build_dir, '*')] + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + # Get absolute path for all source files. extra_files = [os.path.abspath(s) for s in args.sources] # Copy sources from project directory to output directory. # The directory might already exist to hold object files so we ignore that. 
- shutil.copytree(args.build_dir, args.output_dir, dirs_exist_ok=True) + shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) - hipify_result = hipify(project_directory=args.build_dir, + hipify_result = hipify(project_directory=args.project_dir, output_directory=args.output_dir, header_include_dirs=[], includes=includes, @@ -61,13 +65,7 @@ (s_abs in hipify_result and hipify_result[s_abs].hipified_path is not None) else s_abs) - if True: - hipified_sources.append(hipified_s_abs) - else: - hipified_sources.append( - os.path.relpath( - hipified_s_abs, - os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + hipified_sources.append(hipified_s_abs) assert (len(hipified_sources) == len(args.sources)) From 55b73e9d317feeac6406886499f3348a863c797f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 13:17:17 -0500 Subject: [PATCH 42/76] tweaks --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 006897de3812b..ac2990e90654a 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def is_ccache_available() -> bool: - return which("ccacheX") is not None + return which("ccache") is not None def is_ninja_available() -> bool: @@ -71,8 +71,7 @@ def build_extensions(self): '--log-level=TRACE', ] - # TODO: change default to 0 - verbose = bool(int(os.getenv('VERBOSE', '1'))) + verbose = bool(int(os.getenv('VERBOSE', '0'))) if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] From 298fbf242272f8bc5e113b2866c528bf5cbecf11 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 14:39:33 -0500 Subject: [PATCH 43/76] restore accidentally removed comment --- requirements-build.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-build.txt b/requirements-build.txt index 8975f477fe96c..a8efcde590bbf 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,3 +1,4 @@ +# Should be mirrored in pyproject.toml cmake>=3.21 ninja packaging From 9bb0aebf5f8dd805f858c2aa66c38a477aee6059 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 23:50:41 -0500 Subject: [PATCH 44/76] remove cmake logging --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ac2990e90654a..1d0a133b8ce69 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,6 @@ def build_extensions(self): # temporary build directory instead '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( cfg.upper(), self.build_temp), - '--log-level=TRACE', ] verbose = bool(int(os.getenv('VERBOSE', '0'))) From 17349a58a465fcc849e49a701422d0d2760434e8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 23 Feb 2024 00:43:10 -0500 Subject: [PATCH 45/76] add 'supported' target --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b084768041ac3..501069e9131cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -494,3 +494,47 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" "${VLLM_PUNICA_GPU_ARCHES}") + +# +# Add 'supported' target which detects which extensions should be +# built based on platform/architecture. This is the same logic that +# setup.py uses to select which extensions should be built. +# +# The 'supported' target makes direct use of cmake easier since knowledge +# of which extensions are supported have been factored in, e.g. +# +# cmake --build . 
--target supported +# +add_custom_target(supported) + +if (IS_CUDA OR HIP_FOUND) + message(STATUS "Enabling C extension.") + add_dependencies(supported _C) +endif() + +if (IS_CUDA) + message(STATUS "Enabling moe extension.") + add_dependencies(supported _moe_C) + + set(ENABLE_PUNICA) + # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or + # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. + if (DEFINED VLLM_INSTALL_PUNICA_KERNELS OR ENV{VLLM_INSTALL_PUNICA_KERNELS}) + set(ENABLE_PUNICA true) + foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) + string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) + string_to_ver(ARCH_VER ${ARCH_VER_STR}) + if (ARCH_VER VERSION_LESS 8.0) + message(STATUS + "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") + set(ENABLE_PUNICA false) + break() + endif() + endforeach() + endif() + + if (ENABLE_PUNICA) + message(STATUS "Enabling punica extension.") + add_dependencies(supported _punica_C) + endif() +endif() From 6fa22b51066a784a5736b48b24c865b77a077942 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 24 Feb 2024 00:15:06 -0500 Subject: [PATCH 46/76] cleanup comments, add variables for supported torch versions --- CMakeLists.txt | 193 ++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 84 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 501069e9131cb..e8d8d573c5d04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,29 +4,43 @@ project(vllm_extensions LANGUAGES CXX) # # Supported python versions. These versions will be searched in order, the -# first match is be selected. +# first match will be selected. These should be kept in sync with setup.py. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") -# Supported NVIDIA architectures +# Supported NVIDIA architectures. set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -# Supported AMD GPU architectures +# Supported AMD GPU architectures. set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # -# Loop thru all supported python versions until we find the first suitable -# version that has torch installed. +# Supported/expected torch versions for CUDA/ROCm. # -# Cmake is unable to pick the lowest supported version when multiple -# versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. +# Currently, having an incorrect pytorch version results in a warning +# rather than an error. +# +# Note: the CUDA torch version is derived from pyproject.toml and various +# requirements.txt files and should be kept consistent. The ROCm torch +# versions are derived from Dockerfile.rocm +# +set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2") +set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") +set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") + +# +# Loop through all supported python versions until the first suitable version +# that has torch installed. +# +# Note: cmake is unable to pick the lowest supported version when multiple +# versions are available, even with `CMAKE_FIND_PACKAGE_SORT_ORDER`. # foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) - # Attempt to import torch from python we just found. If so, stop - # searching for other versions of python. + # Attempt to import torch from python was just found. If torch can + # be imported, stop searching for other versions of python. # Note: find_package(Torch) won't work here because cmake might not # have the proper search path set yet. 
execute_process( @@ -37,7 +51,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) ERROR_VARIABLE PYTHON_STDERR) if(PYTHON_ERROR_CODE EQUAL 0) - message(STATUS "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") + message(STATUS + "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") break() endif() endif() @@ -45,13 +60,14 @@ endforeach() if (NOT Python_FOUND) message(FATAL_ERROR - "No supported version of python found. ('${PYTHON_SUPPORTED_VERSIONS}')") + "No supported version of python (with pytorch) found. " + "('${PYTHON_SUPPORTED_VERSIONS}')") endif() # -# Run EXPR in python. The standard output of python is stored in OUT and has -# trailing whitespace stripped. If an error is encountered when running python, -# a fatal message ERR_MSG is issued. +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. # macro (run_python OUT EXPR ERR_MSG) execute_process( @@ -68,36 +84,38 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find the MPI package +# Try to find the MPI package. # find_package(MPI) # -# Find where user site-packages and torch are installed and add it to cmake's -# search path. +# Update cmake's `CMAKE_PREFIX_PATH` with probably torch locations. # -# Run EXPR in python after importing PKG. Use the result of this to extend -# CMAKE_PREFIX_PATH so we can import the torch cmake configuration. +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. macro (append_cmake_prefix_path PKG EXPR) run_python(PREFIX_PATH "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -# Add user site-packages and torch path to CMAKE_PREFIX_PATH +# Add user site-packages path to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("site" "site.getusersitepackages()") + +# Query torch for its install path and add to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # # Import torch cmake configuration. -# Torch also imports CUDA/HIP packages with some customizations, so we do not -# need to do this explicitly with check_language/enable_language, etc. +# Torch also imports CUDA (and partially HIP) languages with some customizations, +# so there is no need to do this explicitly with check_language/enable_language, +# etc. # find_package(Torch REQUIRED) # For some reason torch does not add libtorch_python.so to the list of torch -# libraries to link. Find it by hand using 'append_torchlib_if_found' from +# libraries to link. Find it by hand using `append_torchlib_if_found` from # torch's cmake setup. append_torchlib_if_found(torch_python) @@ -111,23 +129,23 @@ endif() if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) - # Verify torch version and warn if it is not expected. 
- if (NOT Torch_VERSION VERSION_EQUAL 2.1.2) - message(WARNING "Pytorch version 2.1.2 expected for CUDA build, " - "saw ${Torch_VERSION} instead.") + if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} " + "expected for CUDA build, saw ${Torch_VERSION} instead.") endif() else() - # Verify torch version and warn if it is not expected (derived from Dockerfile.rocm) - # ROCm 5.7 -> torch 2.0.1 - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL 2.0.1) - message(WARNING "Pytorch version 2.0.1 expected for ROCMm 5.x build, " - "saw ${Torch_VERSION} instead.") + # ROCm 5.x + if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " + "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") endif() - # ROCm 6.0 -> torch 2.1.1 - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND NOT Torch_VERSION VERSION_EQUAL 2.1.1) - message(WARNING "Pytorch version 2.1.1 expected for ROCMm 6.x build, " - "saw ${Torch_VERSION} instead.") + # ROCm 6.x + if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " + "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") endif() endif() @@ -135,20 +153,19 @@ endif() # Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA # and hip flags for ROCm. # -# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat -# detect them explicitly with check_language, etc. +# Note: CUDA (and partially HIP) is detected by pytorch package so there's no +# need to repeat detecting it explicitly with check_language, etc. # if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm configuration but - # does not let cmake recognize .hip files. If we want cmake to be able to - # understand the .hip extension automatically, we need to enable HIP - # explicitly. + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. enable_language(HIP) # - # VLLM_HIP_ARCHITECUTRES controls the --offload-arch flags. - # CMAKE_HIP_ARCHITECTURES is set up by pytorch and can be controlled - # via the PYTORCH_ROCM_ARCH env variable. + # `VLLM_HIP_ARCHITECUTRES` controls the `--offload-arch` flags. + # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled + # via the `PYTORCH_ROCM_ARCH` env variable. # # @@ -164,8 +181,8 @@ if (HIP_FOUND) if(NOT VLLM_HIP_ARCHITECTURES) message(FATAL_ERROR - "None of the detected ROCM architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCM architectures are: ${ROCM_SUPPORTED_ARCHS}.") + "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${ROCM_SUPPORTED_ARCHS}.") endif() set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) @@ -218,31 +235,32 @@ else() message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") # - # Setup/process CUDA arch flags + # Setup/process CUDA arch flags. # - # The torch cmake setup detects and hardcodes the detected architecture flags - # in CMAKE_CUDA_FLAGS. Since CMAKE_CUDA_FLAGS is a "global" variable, we can't - # modify it on a per-target basis, i.e. for the punica extension. 
- # So we have to extract and remove all the '-gencode' flags from - # CMAKE_CUDA_FLAGS for processing. We can't use 'target_compiler_options' - # for adding '-gencode' arguments so we use the target's CUDA_ARCHITECTURES - # property instead. This requires repackaging the architecture flags into a - # format that cmake expects for CUDA_ARCHITECTURES. + # The torch cmake setup hardcodes the detected architecture flags in + # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it + # can't modified on a per-target basis, e.g. for the `punica` extension. + # So, all the `-gencode` flags need to be extracted and removed from + # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. + # Since it's not possible to use `target_compiler_options` for adding target + # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property + # must be used instead. This requires repackaging the architecture flags + # into a format that cmake expects for `CUDA_ARCHITECTURES`. # - # This is a bit fragile in that it depends on torch using -gencode as opposed + # This is a bit fragile in that it depends on torch using `-gencode` as opposed # to one of the other nvcc options to specify architectures. # - # Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override + # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override # detected architectures. # message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - # Extract all '-gencode' flags from CMAKE_CUDA_FLAGS + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) - # Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying - # them and passing them back in via the CUDA_ARCHITECTURES property. + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) @@ -256,7 +274,7 @@ else() message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - # Macro for converting a 'gencode' version number to a cmake version number. + # Macro for converting a `gencode` version number to a cmake version number. macro(string_to_ver OUT_VER IN_STR) string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) endmacro() @@ -265,11 +283,11 @@ else() set(VLLM_GPU_ARCHES) set(VLLM_PUNICA_GPU_ARCHES) - # Process each 'gencode' flag. + # Process each `gencode` flag. foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag we want to extract the version number and whether - # it refers to PTX or native code. - # Note: if a regex matches then CMAKE_MATCH_1 holds the binding + # For each flag, extract the version number and whether it refers to PTX + # or native code. + # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding # for that match. string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) @@ -307,17 +325,17 @@ else() set(CODE_ARCH ${CODE}) endif() - # Check if the current version is in the supported arch list + # Check if the current version is in the supported arch list. string_to_ver(CODE_VER ${CODE_ARCH}) if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) message(STATUS "discarding unsupported CUDA arch ${VER}.") continue() endif() - # Add it to the arch list + # Add it to the arch list. 
list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") - # Add it to punica arch list if the version is >= 8.0 + # Add it to punica arch list if the version is >= 8.0. if (CODE_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") endif() @@ -328,17 +346,17 @@ else() endif() # -# Define targets +# Define extension targets # # -# Add a target named `NAME` that runs the hipify preprocessor on a set of -# CUDA source files. The names of the corresponding "hipified" sources -# are stored in `OUT_SRCS`. +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. # macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # - # Split into C++ and non-C++ (i.e. CUDA) sources + # Split into C++ and non-C++ (i.e. CUDA) sources. # set(SRCS ${ORIG_SRCS}) set(CXX_SRCS ${ORIG_SRCS}) @@ -349,7 +367,9 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) message(DEBUG "cxx sources: ${CXX_SRCS}") # - # Generate ROCM/HIP source file names from CUDA file names. + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. # set(HIP_SRCS) foreach (SRC ${SRCS}) @@ -365,7 +385,7 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on extension source files.") + COMMENT "Running hipify on ${NAME} extension source files.") # Swap out original extension sources with hipified sources. set(${OUT_SRCS} ${HIP_SRCS}) @@ -382,15 +402,17 @@ endmacro() # directory. # MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. # MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to cmake documentation on CMAKE_CUDA_ARCHITECTURES -# and CMAKE_HIP_ARCHITECTURES for more info. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. # # Note: optimization level/debug info is set via cmake build type. # +# TODO: consider passing the language (CUDA/HIP/etc.) as an argument. +# function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) - # Add hipify preprocessing step if we are building with HIP/ROCm. + # Add hipify preprocessing step when building with HIP/ROCm. if (NOT IS_CUDA AND HIP_FOUND) hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() @@ -496,13 +518,16 @@ define_module_target(_punica_C "${VLLM_PUNICA_GPU_ARCHES}") # -# Add 'supported' target which detects which extensions should be -# built based on platform/architecture. This is the same logic that -# setup.py uses to select which extensions should be built. +# Add the `supported` target which detects which extensions should be +# built based on platform/architecture. This is the same logic that +# setup.py uses to select which extensions should be built and should +# be kept in sync. # -# The 'supported' target makes direct use of cmake easier since knowledge -# of which extensions are supported have been factored in, e.g. +# The `supported` target makes direct use of cmake easier since knowledge +# of which extensions are supported has been factored in, e.g. # +# mkdir build && cd build +# cmake -G Ninja .. # cmake --build . 
--target supported # add_custom_target(supported) From ed3f191a8b9541212e89e0b3ab36f2f97d2d23cb Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 17:56:40 +0000 Subject: [PATCH 47/76] replace IS_CUDA with VLLM_GPU_LANG --- CMakeLists.txt | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e8d8d573c5d04..c99bccd15d296 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,21 +119,20 @@ find_package(Torch REQUIRED) # torch's cmake setup. append_torchlib_if_found(torch_python) -if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) - message(FATAL_ERROR "Can't find CUDA or HIP installation.") -endif() - # -# Check the torch version and warn if it isn't what is expected. +# Set up GPU language and check the torch version and warn if it isn't +# what is expected. # if (NOT HIP_FOUND AND CUDA_FOUND) - set(IS_CUDA true) + set(VLLM_GPU_LANG "CUDA") if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA}) message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} " "expected for CUDA build, saw ${Torch_VERSION} instead.") endif() -else() +elseif(HIP_FOUND) + set(VLLM_GPU_LANG "HIP") + # ROCm 5.x if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) @@ -147,6 +146,8 @@ else() message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") endif() +else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() # @@ -407,34 +408,28 @@ endmacro() # # Note: optimization level/debug info is set via cmake build type. # -# TODO: consider passing the language (CUDA/HIP/etc.) as an argument. -# -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS +function(define_module_target MOD_NAME MOD_GPU_LANG MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) # Add hipify preprocessing step when building with HIP/ROCm. - if (NOT IS_CUDA AND HIP_FOUND) + if (MOD_GPU_LANG STREQUAL "HIP") hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - if (IS_CUDA) - set(GPU_LANG "CUDA") - else() - set(GPU_LANG "HIP") - + if (MOD_GPU_LANG STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. 
add_dependencies(${MOD_NAME} hipify${MOD_NAME}) endif() - set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES "${MOD_GPU_ARCHES}") set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) + $<$:${MOD_EXTRA_GPU_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") @@ -463,13 +458,14 @@ set(VLLM_EXT_SRC "csrc/moe_align_block_size_kernels.cu" "csrc/pybind.cpp") -if(IS_CUDA) +if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") endif() define_module_target(_C + "${VLLM_GPU_LANG}" "${VLLM_EXT_SRC}" "${VLLM_GPU_FLAGS}" "${VLLM_GPU_ARCHES}") @@ -483,6 +479,7 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/topk_softmax_kernels.cu") define_module_target(_moe_C + "${VLLM_GPU_LANG}" "${VLLM_MOE_EXT_SRC}" "${VLLM_GPU_FLAGS}" "${VLLM_GPU_ARCHES}") @@ -513,6 +510,7 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/punica_ops.cc") define_module_target(_punica_C + "${VLLM_GPU_LANG}" "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" "${VLLM_PUNICA_GPU_ARCHES}") @@ -532,12 +530,12 @@ define_module_target(_punica_C # add_custom_target(supported) -if (IS_CUDA OR HIP_FOUND) +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling C extension.") add_dependencies(supported _C) endif() -if (IS_CUDA) +if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") add_dependencies(supported _moe_C) From d9cc84032b8276020edf7793341f6fb5a7d0ce7c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 18:06:51 +0000 Subject: [PATCH 48/76] update comment + remove some debug logging --- CMakeLists.txt | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c99bccd15d296..9082a28db5be6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,9 +232,6 @@ else() "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") - message(DEBUG "nvcc: ${VLLM_GPU_FLAGS}") - message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") - # # Setup/process CUDA arch flags. # @@ -273,6 +270,7 @@ else() "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") endif() + message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") # Macro for converting a `gencode` version number to a cmake version number. @@ -364,9 +362,6 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - message(DEBUG "sources: ${SRCS}") - message(DEBUG "cxx sources: ${CXX_SRCS}") - # # Generate ROCm/HIP source file names from CUDA file names. # Since HIP files are generated code, they will appear in the build area @@ -378,7 +373,6 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - message(DEBUG "hip src: ${HIP_SRCS}") set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( @@ -391,14 +385,13 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # Swap out original extension sources with hipified sources. set(${OUT_SRCS} ${HIP_SRCS}) list(APPEND ${OUT_SRCS} ${CXX_SRCS}) - - message(DEBUG "final ext src: ${OUT_SRCS}") endmacro() # # Define a target named `MOD_NAME` for a single extension. 
The # arguments are: # +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. # MOD_SRC - the list of source files relative to CMakeLists.txt # directory. # MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. From 3999ed2f017fc39947ccef20fe87c0df7d966247 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 22:50:17 +0000 Subject: [PATCH 49/76] review comments + some tweaks to setup.py --- CMakeLists.txt | 21 ++++++++------------- setup.py | 30 +++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9082a28db5be6..f7381c3051bf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,12 +84,7 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find the MPI package. -# -find_package(MPI) - -# -# Update cmake's `CMAKE_PREFIX_PATH` with probably torch locations. +# Update cmake's `CMAKE_PREFIX_PATH` with probable torch locations. # # Run `EXPR` in python after importing `PKG`. Use the result of this to extend @@ -509,28 +504,28 @@ define_module_target(_punica_C "${VLLM_PUNICA_GPU_ARCHES}") # -# Add the `supported` target which detects which extensions should be +# Add the `default` target which detects which extensions should be # built based on platform/architecture. This is the same logic that # setup.py uses to select which extensions should be built and should # be kept in sync. # -# The `supported` target makes direct use of cmake easier since knowledge +# The `default` target makes direct use of cmake easier since knowledge # of which extensions are supported has been factored in, e.g. # # mkdir build && cd build # cmake -G Ninja .. -# cmake --build . --target supported +# cmake --build . --target default # -add_custom_target(supported) +add_custom_target(default) if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling C extension.") - add_dependencies(supported _C) + add_dependencies(default _C) endif() if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") - add_dependencies(supported _moe_C) + add_dependencies(default _moe_C) set(ENABLE_PUNICA) # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or @@ -551,6 +546,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (ENABLE_PUNICA) message(STATUS "Enabling punica extension.") - add_dependencies(supported _punica_C) + add_dependencies(default _punica_C) endif() endif() diff --git a/setup.py b/setup.py index 1d0a133b8ce69..e6f1ada774a89 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,10 @@ MAIN_CUDA_VERSION = "12.1" +def is_sccache_available() -> bool: + return which("sccache") is not None + + def is_ccache_available() -> bool: return which("ccache") is not None @@ -42,6 +46,8 @@ def __init__(self, name, cmake_lists_dir='.', **kwa): class cmake_build_ext(build_ext): + # A flag to ensure that the cmake config step only runs once. 
+ did_config = False def build_extensions(self): # Ensure that CMake is present and working @@ -74,7 +80,12 @@ def build_extensions(self): if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] - if is_ccache_available(): + if is_sccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', + ] + elif is_ccache_available(): cmake_args += [ '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', @@ -83,7 +94,11 @@ def build_extensions(self): # # Setup parallelism # - num_jobs = os.cpu_count() + try: + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version >= Version("11.2"): nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) @@ -107,16 +122,17 @@ def build_extensions(self): build_jobs = ['-j', str(num_jobs)] # Config - # TODO: this only needs to happen once - subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + - cmake_args, - cwd=self.build_temp) + if not cmake_build_ext.did_config: + cmake_build_ext.did_config = True + subprocess.check_call( + ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], + cwd=self.build_temp) # Build build_args = [ '--build', '.', '--config', cfg, '--target', ext_target_name ] - subprocess.check_call(['cmake'] + build_args + build_jobs, + subprocess.check_call(['cmake', *build_args, *build_jobs], cwd=self.build_temp) From c669467c8cc618706062f1a0a9924f47d79f253e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 19:53:12 +0000 Subject: [PATCH 50/76] move utilities to utils.cmake, change find python process to use binary, remove unneeded user site packages from path --- CMakeLists.txt | 173 ++++++------------------------------------------- setup.py | 7 ++ utils.cmake | 135 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 154 deletions(-) create mode 100644 utils.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f7381c3051bf9..e2106d1162f7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +include(utils.cmake) + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -29,76 +31,20 @@ set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") # -# Loop through all supported python versions until the first suitable version -# that has torch installed. -# -# Note: cmake is unable to pick the lowest supported version when multiple -# versions are available, even with `CMAKE_FIND_PACKAGE_SORT_ORDER`. +# Try to find python package with an executable that exactly matches +# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions. # -foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) - message(STATUS "Attempting to find python ${VER} package.") - find_package(Python ${VER} COMPONENTS Interpreter Development.Module) - if (Python_FOUND) - # Attempt to import torch from python was just found. If torch can - # be imported, stop searching for other versions of python. - # Note: find_package(Torch) won't work here because cmake might not - # have the proper search path set yet. 
- execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "import torch" - OUTPUT_VARIABLE PYTHON_OUT - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR) - - if(PYTHON_ERROR_CODE EQUAL 0) - message(STATUS - "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") - break() - endif() - endif() -endforeach() - -if (NOT Python_FOUND) +if (VLLM_PYTHON_EXECUTABLE) + find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") +else() message(FATAL_ERROR - "No supported version of python (with pytorch) found. " - "('${PYTHON_SUPPORTED_VERSIONS}')") + "Please set VLLM_PYTHON_EXECUTABLE to the desired python version before " + "running cmake configure.") endif() # -# Run `EXPR` in python. The standard output of python is stored in `OUT` and -# has trailing whitespace stripped. If an error is encountered when running -# python, a fatal message `ERR_MSG` is issued. +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. # -macro (run_python OUT EXPR ERR_MSG) - execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(NOT PYTHON_ERROR_CODE EQUAL 0) - message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") - endif() -endmacro() - -# -# Update cmake's `CMAKE_PREFIX_PATH` with probable torch locations. -# - -# Run `EXPR` in python after importing `PKG`. Use the result of this to extend -# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. -macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH - "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endmacro() - -# Add user site-packages path to `CMAKE_PREFIX_PATH`. -append_cmake_prefix_path("site" "site.getusersitepackages()") - -# Query torch for its install path and add to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # @@ -343,93 +289,6 @@ endif() # Define extension targets # -# -# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set -# of CUDA source files. The names of the corresponding "hipified" sources are -# stored in `OUT_SRCS`. -# -macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) - # - # Split into C++ and non-C++ (i.e. CUDA) sources. - # - set(SRCS ${ORIG_SRCS}) - set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - - # - # Generate ROCm/HIP source file names from CUDA file names. - # Since HIP files are generated code, they will appear in the build area - # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. - # - set(HIP_SRCS) - foreach (SRC ${SRCS}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} - DEPENDS hipify.py ${SRCS} - BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on ${NAME} extension source files.") - - # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() - -# -# Define a target named `MOD_NAME` for a single extension. 
The -# arguments are: -# -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. -# -# Note: optimization level/debug info is set via cmake build type. -# -function(define_module_target MOD_NAME MOD_GPU_LANG MOD_SRC MOD_EXTRA_GPU_FLAGS - MOD_GPU_ARCHES) - - # Add hipify preprocessing step when building with HIP/ROCm. - if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") - endif() - - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - if (MOD_GPU_LANG STREQUAL "HIP") - # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) - endif() - - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") - - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) - - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - - target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) -endfunction() - # # _C extension # @@ -452,7 +311,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/custom_all_reduce.cu") endif() -define_module_target(_C +define_gpu_extension_target( + _C + vllm "${VLLM_GPU_LANG}" "${VLLM_EXT_SRC}" "${VLLM_GPU_FLAGS}" @@ -466,7 +327,9 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_ops.cpp" "csrc/moe/topk_softmax_kernels.cu") -define_module_target(_moe_C +define_gpu_extension_target( + _moe_C + vllm "${VLLM_GPU_LANG}" "${VLLM_MOE_EXT_SRC}" "${VLLM_GPU_FLAGS}" @@ -497,7 +360,9 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" "csrc/punica/punica_ops.cc") -define_module_target(_punica_C +define_gpu_extension_target( + _punica_C + vllm "${VLLM_GPU_LANG}" "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" diff --git a/setup.py b/setup.py index e6f1ada774a89..81e44dca9fb18 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ import os import re import subprocess +import sys from typing import List from packaging.version import parse, Version @@ -121,6 +122,12 @@ def build_extensions(self): build_tool = ['-G', 'Unix Makefiles'] build_jobs = ['-j', str(num_jobs)] + # Pass the python executable to cmake so it can find an exact + # match. + cmake_args += [ + '-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable) + ] + # Config if not cmake_build_ext.did_config: cmake_build_ext.did_config = True diff --git a/utils.cmake b/utils.cmake new file mode 100644 index 0000000000000..6b782e6f96d4c --- /dev/null +++ b/utils.cmake @@ -0,0 +1,135 @@ +# +# Attempt to find the python pacakge that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. 
+# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${VER}) is not one of the supported versions: " + "${SUPPORTED_VERSIONS}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +macro (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() +endmacro() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. + # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) +endmacro() + +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_DEST - module destination directory. +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# +# Note: optimization level/debug info is set via cmake build type. 
+# +function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC + MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + + # Add hipify preprocessing step when building with HIP/ROCm. + if (MOD_GPU_LANG STREQUAL "HIP") + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + + if (MOD_GPU_LANG STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + endif() + + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") + + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_GPU_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE + csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) +endfunction() From 4c9f6b0f8efc584c1a7756c50486dca8894fa9ac Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:15:24 +0000 Subject: [PATCH 51/76] fix typo --- utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.cmake b/utils.cmake index 6b782e6f96d4c..800d371328e9f 100644 --- a/utils.cmake +++ b/utils.cmake @@ -1,5 +1,5 @@ # -# Attempt to find the python pacakge that uses the same python executable as +# Attempt to find the python package that uses the same python executable as # `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. # macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) From 96d86ccd8ee753accfc6e04b4a348efd28a64505 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:36:12 +0000 Subject: [PATCH 52/76] add path to include of utils.cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2106d1162f7c..adcc56c8ce5d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -include(utils.cmake) +include(${CMAKE_SOURCE_DIR}/utils.cmake) # # Supported python versions. These versions will be searched in order, the From fcbd89f508ee8491d5ac5e7c1283701b08e49b4b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:41:08 +0000 Subject: [PATCH 53/76] try another path for utils.cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adcc56c8ce5d8..64f53d8938d6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -include(${CMAKE_SOURCE_DIR}/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) # # Supported python versions. 
These versions will be searched in order, the From 2f0ed6d8728f8b9849aebbe9d855186fdb44153b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:53:04 +0000 Subject: [PATCH 54/76] add utils.cmake to Dockerfile + MANIFEST.in --- Dockerfile | 1 + MANIFEST.in | 1 + 2 files changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 97e629dc07abb..f9f6048dfe3d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,6 +40,7 @@ COPY csrc csrc COPY setup.py setup.py COPY hipify.py hipify.py COPY CMakeLists.txt CMakeLists.txt +COPY utils.cmake utils.cmake COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 38c9e58b4e73e..25087882bec72 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include LICENSE include requirements.txt include CMakeLists.txt +include utils.cmake recursive-include csrc * From 086de5c09012acc68c383964fdeaef405211cb9e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 21:41:47 +0000 Subject: [PATCH 55/76] remove mpi include directories --- utils.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils.cmake b/utils.cmake index 800d371328e9f..f2419bdb24a73 100644 --- a/utils.cmake +++ b/utils.cmake @@ -126,8 +126,7 @@ function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(${MOD_NAME} PRIVATE csrc) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) From 0ff8825a28136daba6cc4731a91e7182c18099b7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 18:34:27 +0000 Subject: [PATCH 56/76] refactor setup.py so cmake configuration is separate from cmake build --- setup.py | 178 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 98 insertions(+), 80 deletions(-) diff --git a/setup.py b/setup.py index 81e44dca9fb18..ddb59a1e85dc9 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,93 @@ def __init__(self, name, cmake_lists_dir='.', **kwa): class cmake_build_ext(build_ext): - # A flag to ensure that the cmake config step only runs once. - did_config = False + # A dict of extension directories that have been configured. + did_config = {} + + # + # Determine number of compilation jobs and optionally nvcc compile threads. + # + def compute_num_jobs(self): + try: + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) + else: + nvcc_threads = None + + return num_jobs, nvcc_threads + + # + # Perform cmake configuration for a single extension. + # + def configure(self, ext): + # If we've already configured using the CMakeLists.txt for + # this extension, exit early. + if ext.cmake_lists_dir in cmake_build_ext.did_config: + return + + cmake_build_ext.did_config[ext.cmake_lists_dir] = True + + # Select the build type. + # Note: optimization level + debug info are set by the build type + cfg = os.getenv("CMAKE_BUILD_TYPE", "RelWithDebInfo") + + # where .so files will be written, should be the same for all extensions + # that use the same CMakeLists.txt. 
+ outdir = os.path.abspath( + os.path.dirname(self.get_ext_fullpath(ext.name))) + + cmake_args = [ + '-DCMAKE_BUILD_TYPE={}'.format(cfg), + '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir), + '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp), + ] + + verbose = bool(int(os.getenv('VERBOSE', '0'))) + if verbose: + cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] + + if is_sccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', + ] + elif is_ccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', + ] + + # Pass the python executable to cmake so it can find an exact + # match. + cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] + + # + # Setup parallelism and build tool + # + num_jobs, nvcc_threads = self.compute_num_jobs() + + if nvcc_threads: + cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] + + if is_ninja_available(): + build_tool = ['-G', 'Ninja'] + cmake_args += [ + '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', + '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), + ] + else: + # Default build tool to whatever cmake picks. + build_tool = [] + + subprocess.check_call( + ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], + cwd=self.build_temp) def build_extensions(self): # Ensure that CMake is present and working @@ -57,90 +142,23 @@ def build_extensions(self): except OSError as e: raise RuntimeError('Cannot find CMake executable') from e - for ext in self.extensions: - - extdir = os.path.abspath( - os.path.dirname(self.get_ext_fullpath(ext.name))) - - # Note: optimization level + debug info set by the build type - cfg = os.getenv("VLLM_BUILD_TYPE", "RelWithDebInfo") - - cmake_args = [ - '-DCMAKE_BUILD_TYPE=%s' % cfg, - # Ask CMake to place the resulting library in the directory - # containing the extension - '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format( - cfg.upper(), extdir), - # Other intermediate static libraries are placed in a - # temporary build directory instead - '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( - cfg.upper(), self.build_temp), - ] + # Create build directory if it does not exist. 
+ if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) - verbose = bool(int(os.getenv('VERBOSE', '0'))) - if verbose: - cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] - - if is_sccache_available(): - cmake_args += [ - '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', - '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', - ] - elif is_ccache_available(): - cmake_args += [ - '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', - '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', - ] - - # - # Setup parallelism - # - try: - num_jobs = len(os.sched_getaffinity(0)) - except AttributeError: - num_jobs = os.cpu_count() - - nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) - cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] - - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) + # Build all the extensions + for ext in self.extensions: + self.configure(ext) ext_target_name = remove_prefix(ext.name, "vllm.") + num_jobs, _ = self.compute_num_jobs() - if is_ninja_available(): - build_tool = ['-G', 'Ninja'] - cmake_args += [ - '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', - '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), - ] - build_jobs = [] - else: - build_tool = ['-G', 'Unix Makefiles'] - build_jobs = ['-j', str(num_jobs)] - - # Pass the python executable to cmake so it can find an exact - # match. - cmake_args += [ - '-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable) - ] - - # Config - if not cmake_build_ext.did_config: - cmake_build_ext.did_config = True - subprocess.check_call( - ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], - cwd=self.build_temp) - - # Build build_args = [ - '--build', '.', '--config', cfg, '--target', ext_target_name + '--build', '.', '--target', ext_target_name, '-j', + str(num_jobs) ] - subprocess.check_call(['cmake', *build_args, *build_jobs], - cwd=self.build_temp) + + subprocess.check_call(['cmake', *build_args], cwd=self.build_temp) def _is_cuda() -> bool: From f625d6d9d4027b3ca3b22e27cc76f187829b05fa Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:00:22 +0000 Subject: [PATCH 57/76] let --debug control build type if CMAKE_BUILD_TYPE is not set --- CMakeLists.txt | 2 ++ setup.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64f53d8938d6f..a490eb458b37e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) # diff --git a/setup.py b/setup.py index ddb59a1e85dc9..fad642717dcdc 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,8 @@ def configure(self, ext): # Select the build type. # Note: optimization level + debug info are set by the build type - cfg = os.getenv("CMAKE_BUILD_TYPE", "RelWithDebInfo") + default_cfg = "Debug" if self.debug else "RelWithDebInfo" + cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg) # where .so files will be written, should be the same for all extensions # that use the same CMakeLists.txt. 
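For context, a minimal sketch of how the CMakeExtension and cmake_build_ext classes above are typically wired into setup(). The actual setup() call is outside this excerpt, so the extension names below are assumptions inferred from the cmake targets (_C, _moe_C, _punica_C) and from the "vllm." prefix handling in build_extensions(); treat this as an illustration, not as code from the patch.

from setuptools import setup

# Hypothetical wiring; extension names assumed from the cmake targets above.
# remove_prefix() strips the leading "vllm.", so "vllm._C" builds the "_C"
# target. All extensions share one CMakeLists.txt, so the cmake configure step
# runs once (tracked in the did_config dict) and only per-target builds repeat.
ext_modules = [
    CMakeExtension(name="vllm._C"),
    CMakeExtension(name="vllm._moe_C"),
    CMakeExtension(name="vllm._punica_C"),
]

setup(
    ext_modules=ext_modules,
    cmdclass={"build_ext": cmake_build_ext},
)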
From efda6fb825dd9ea6d0212e891f9751138c3f8a1b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:07:32 +0000 Subject: [PATCH 58/76] add some type annotations to setup.py --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index fad642717dcdc..41dd244f085c4 100644 --- a/setup.py +++ b/setup.py @@ -41,8 +41,8 @@ def remove_prefix(text, prefix): class CMakeExtension(Extension): - def __init__(self, name, cmake_lists_dir='.', **kwa): - Extension.__init__(self, name, sources=[], **kwa) + def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: + super().__init__(name, sources=[], **kwa) self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) @@ -71,7 +71,7 @@ def compute_num_jobs(self): # # Perform cmake configuration for a single extension. # - def configure(self, ext): + def configure(self, ext: CMakeExtension) -> None: # If we've already configured using the CMakeLists.txt for # this extension, exit early. if ext.cmake_lists_dir in cmake_build_ext.did_config: @@ -136,7 +136,7 @@ def configure(self, ext): ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], cwd=self.build_temp) - def build_extensions(self): + def build_extensions(self) -> None: # Ensure that CMake is present and working try: subprocess.check_output(['cmake', '--version']) From 3fc98a60c3ecb884eb4b0045684aee0dfbc25aa8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:24:45 +0000 Subject: [PATCH 59/76] add comment --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 41dd244f085c4..9f24588be48a3 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ class cmake_build_ext(build_ext): # def compute_num_jobs(self): try: + # os.sched_getaffinity() isn't univerally available, so fall back + # to os.cpu_count() if we get an error here. num_jobs = len(os.sched_getaffinity(0)) except AttributeError: num_jobs = os.cpu_count() From a68fec9d6e5ed3ad37ca360a0e1b03e17f8772c1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:54:53 +0000 Subject: [PATCH 60/76] review comments + move most of the arch/compiler flag code to utils.cmake --- CMakeLists.txt | 246 ++++++---------------------- Dockerfile | 3 +- MANIFEST.in | 2 +- hipify.py => cmake/hipify.py | 0 cmake/utils.cmake | 308 +++++++++++++++++++++++++++++++++++ utils.cmake | 134 --------------- 6 files changed, 362 insertions(+), 331 deletions(-) rename hipify.py => cmake/hipify.py (100%) create mode 100644 cmake/utils.cmake delete mode 100644 utils.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a490eb458b37e..0b1bc113b97a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(vllm_extensions LANGUAGES CXX) message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") -include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # # Supported python versions. These versions will be searched in order, the @@ -13,10 +13,10 @@ include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") # Supported NVIDIA architectures. -set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures. -set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # # Supported/expected torch versions for CUDA/ROCm. 
@@ -40,8 +40,8 @@ if (VLLM_PYTHON_EXECUTABLE) find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") else() message(FATAL_ERROR - "Please set VLLM_PYTHON_EXECUTABLE to the desired python version before " - "running cmake configure.") + "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version" + " before running cmake configure.") endif() # @@ -76,6 +76,11 @@ if (NOT HIP_FOUND AND CUDA_FOUND) elseif(HIP_FOUND) set(VLLM_GPU_LANG "HIP") + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. + enable_language(HIP) + # ROCm 5.x if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) @@ -94,197 +99,26 @@ else() endif() # -# Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA -# and hip flags for ROCm. -# -# Note: CUDA (and partially HIP) is detected by pytorch package so there's no -# need to repeat detecting it explicitly with check_language, etc. +# Override the GPU architectures detected by cmake/torch and filter them by +# the supported versions for the current language. +# The final set of arches is stored in `VLLM_GPU_ARCHES`. # -if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm configuration but does - # not let cmake recognize .hip files. In order to get cmake to understand the - # .hip extension automatically, HIP must be enabled explicitly. - enable_language(HIP) - - # - # `VLLM_HIP_ARCHITECUTRES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. - # - - # - # Find the intersection of the supported + detected architectures to - # set the module architecture flags. - # - set(VLLM_HIP_ARCHITECTURES) - foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS) - list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH}) - endif() - endforeach() - - if(NOT VLLM_HIP_ARCHITECTURES) - message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${ROCM_SUPPORTED_ARCHS}.") - endif() - - set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - set(VLLM_PUNICA_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - - # Get common HIP/HIPCC flags from torch. - run_python(VLLM_GPU_FLAGS - "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - run_python(VLLM_HIPCC_FLAGS - "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - list(APPEND VLLM_GPU_FLAGS - ${VLLM_HIPCC_FLAGS} - "-DUSE_ROCM" - "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__" - "-fno-gpu-rdc") - -else() - # - # Get common NVCC flags from torch. 
- # - run_python(VLLM_GPU_FLAGS - "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_GPU_FLAGS "-DENABLE_FP8_E5M2") - endif() - - if(NVCC_THREADS) - list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") - endif() - - set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) - - # - # Copy flags+update for punica - # - list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - - # - # Setup/process CUDA arch flags. - # - # The torch cmake setup hardcodes the detected architecture flags in - # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it - # can't modified on a per-target basis, e.g. for the `punica` extension. - # So, all the `-gencode` flags need to be extracted and removed from - # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. - # Since it's not possible to use `target_compiler_options` for adding target - # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property - # must be used instead. This requires repackaging the architecture flags - # into a format that cmake expects for `CUDA_ARCHITECTURES`. - # - # This is a bit fragile in that it depends on torch using `-gencode` as opposed - # to one of the other nvcc options to specify architectures. - # - # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override - # detected architectures. - # - message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - - # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` - string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS - ${CMAKE_CUDA_FLAGS}) - - # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified - # and passed back via the `CUDA_ARCHITECTURES` property. - string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS - ${CMAKE_CUDA_FLAGS}) - - # If this error is triggered, it might mean that torch has changed how it sets - # up nvcc architecture code generation flags. - if (NOT VLLM_CUDA_ARCH_FLAGS) - message(FATAL_ERROR - "Could not find any architecture related code generation flags in " - "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") - endif() - - message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - - # Macro for converting a `gencode` version number to a cmake version number. - macro(string_to_ver OUT_VER IN_STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) - endmacro() - - # Initialize the architecture lists to empty. - set(VLLM_GPU_ARCHES) - set(VLLM_PUNICA_GPU_ARCHES) - - # Process each `gencode` flag. - foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag, extract the version number and whether it refers to PTX - # or native code. - # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding - # for that match. - - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) - endif() - - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) - endif() - - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) - endif() - - # Make sure the virtual architecture can be matched. 
- if (NOT COMPUTE) - message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") - endif() - - # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) - message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") - endif() - - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) - else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) - endif() - - # Check if the current version is in the supported arch list. - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - message(STATUS "discarding unsupported CUDA arch ${VER}.") - continue() - endif() - - # Add it to the arch list. - list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") +override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") - # Add it to punica arch list if the version is >= 8.0. - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") - endif() - endforeach() +# +# Query torch for additional GPU compilation flags for the given +# `VLLM_GPU_LANG`. +# The final set of arches is stored in `VLLM_GPU_FLAGS`. +# +get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG}) - message(DEBUG "nvcc arch: ${VLLM_GPU_ARCHES}") - message(DEBUG "punica arch: ${VLLM_PUNICA_GPU_ARCHES}") +# +# Set nvcc parallelism. +# +if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() # @@ -362,6 +196,30 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" "csrc/punica/punica_ops.cc") +# +# Copy GPU compilation flags+update for punica +# +set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) +list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + +# +# Filter out CUDA architectures < 8.0 for punica. +# +if (${VLLM_GPU_LANG} STREQUAL "CUDA") + set(VLLM_PUNICA_GPU_ARCHES) + foreach(ARCH ${VLLM_GPU_ARCHES}) + string_to_ver(CODE_VER ${ARCH}) + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH}) + endif() + endforeach() + message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") +endif() + define_gpu_extension_target( _punica_C vllm @@ -380,7 +238,7 @@ define_gpu_extension_target( # of which extensions are supported has been factored in, e.g. # # mkdir build && cd build -# cmake -G Ninja .. +# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm .. # cmake --build . 
--target default # add_custom_target(default) diff --git a/Dockerfile b/Dockerfile index f9f6048dfe3d9..6a56a33cfe7ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,9 +38,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py -COPY hipify.py hipify.py +COPY cmake cmake COPY CMakeLists.txt CMakeLists.txt -COPY utils.cmake utils.cmake COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 25087882bec72..aa16da6500e6c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include LICENSE include requirements.txt include CMakeLists.txt -include utils.cmake +recursive-include cmake * recursive-include csrc * diff --git a/hipify.py b/cmake/hipify.py similarity index 100% rename from hipify.py rename to cmake/hipify.py diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 0000000000000..f0a73316427b7 --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,308 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${VER}) is not one of the supported versions: " + "${SUPPORTED_VERSIONS}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +macro (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() +endmacro() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. 
+ # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) +endmacro() + +# +# Get additional GPU compiler flags from torch. +# +macro(get_torch_gpu_compiler_flags GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(${GPU_FLAGS} + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND ${GPU_FLAGS} "-DENABLE_FP8_E5M2") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(${GPU_FLAGS} + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND ${GPU_FLAGS} + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") + + endif() +endmacro() + +# Macro for converting a `gencode` version number to a cmake version number. +macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Override the GPU architectures detected by cmake/torch and filter them by +# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in +# `GPU_ARCHES`. +# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supprted arches: ${GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. + # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled + # via the `PYTORCH_ROCM_ARCH` env variable. + # + + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (ARCH IN_LIST GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES_LIST}.") + endif() + + elseif(${GPU_LANG} STREQUAL "CUDA") + # + # Setup/process CUDA arch flags. + # + # The torch cmake setup hardcodes the detected architecture flags in + # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it + # can't modified on a per-target basis, e.g. for the `punica` extension. + # So, all the `-gencode` flags need to be extracted and removed from + # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. + # Since it's not possible to use `target_compiler_options` for adding target + # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property + # must be used instead. 
This requires repackaging the architecture flags + # into a format that cmake expects for `CUDA_ARCHITECTURES`. + # + # This is a bit fragile in that it depends on torch using `-gencode` as opposed + # to one of the other nvcc options to specify architectures. + # + # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override + # detected architectures. + # + message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` + string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # If this error is triggered, it might mean that torch has changed how it sets + # up nvcc architecture code generation flags. + if (NOT _CUDA_ARCH_FLAGS) + message(FATAL_ERROR + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") + endif() + + message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") + + # Initialize the architecture lists to empty. + set(${GPU_ARCHES}) + + # Process each `gencode` flag. + foreach(ARCH ${_CUDA_ARCH_FLAGS}) + # For each flag, extract the version number and whether it refers to PTX + # or native code. + # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) + endif() + + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) + else() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) + endif() + + # Check if the current version is in the supported arch list. + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES_LIST) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list. + list(APPEND ${GPU_ARCHES} "${CODE_ARCH}${VIRT}") + endforeach() + endif() + message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") +endmacro() + +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_DEST - module destination directory. +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# +# Note: optimization level/debug info is set via cmake build type. 
+# +function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC + MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + + # Add hipify preprocessing step when building with HIP/ROCm. + if (MOD_GPU_LANG STREQUAL "HIP") + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + + if (MOD_GPU_LANG STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + endif() + + if (MOD_GPU_ARCHES) + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") + endif() + + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_GPU_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE csrc) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) +endfunction() diff --git a/utils.cmake b/utils.cmake deleted file mode 100644 index f2419bdb24a73..0000000000000 --- a/utils.cmake +++ /dev/null @@ -1,134 +0,0 @@ -# -# Attempt to find the python package that uses the same python executable as -# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. -# -macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) - file(REAL_PATH ${EXECUTABLE} EXECUTABLE) - set(Python_EXECUTABLE ${EXECUTABLE}) - find_package(Python COMPONENTS Interpreter Development.Module) - if (NOT Python_FOUND) - message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") - endif() - set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) - if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) - message(FATAL_ERROR - "Python version (${VER}) is not one of the supported versions: " - "${SUPPORTED_VERSIONS}.") - endif() - message(STATUS "Found python matching: ${EXECUTABLE}.") -endmacro() - -# -# Run `EXPR` in python. The standard output of python is stored in `OUT` and -# has trailing whitespace stripped. If an error is encountered when running -# python, a fatal message `ERR_MSG` is issued. -# -macro (run_python OUT EXPR ERR_MSG) - execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(NOT PYTHON_ERROR_CODE EQUAL 0) - message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") - endif() -endmacro() - -# Run `EXPR` in python after importing `PKG`. Use the result of this to extend -# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. -macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH - "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endmacro() - -# -# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set -# of CUDA source files. The names of the corresponding "hipified" sources are -# stored in `OUT_SRCS`. -# -macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) - # - # Split into C++ and non-C++ (i.e. CUDA) sources. - # - set(SRCS ${ORIG_SRCS}) - set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - - # - # Generate ROCm/HIP source file names from CUDA file names. 
- # Since HIP files are generated code, they will appear in the build area - # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. - # - set(HIP_SRCS) - foreach (SRC ${SRCS}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} - DEPENDS hipify.py ${SRCS} - BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on ${NAME} extension source files.") - - # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() - -# -# Define a target named `MOD_NAME` for a single extension. The -# arguments are: -# -# MOD_DEST - module destination directory. -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. -# -# Note: optimization level/debug info is set via cmake build type. -# -function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC - MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) - - # Add hipify preprocessing step when building with HIP/ROCm. - if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") - endif() - - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - if (MOD_GPU_LANG STREQUAL "HIP") - # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) - endif() - - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") - - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) - - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - - target_include_directories(${MOD_NAME} PRIVATE csrc) - - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) -endfunction() From 094c448613595ae40fd52246fb0a6401b37539f8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:55:20 +0000 Subject: [PATCH 61/76] typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9f24588be48a3..69c909ea92817 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ class cmake_build_ext(build_ext): # def compute_num_jobs(self): try: - # os.sched_getaffinity() isn't univerally available, so fall back + # os.sched_getaffinity() isn't universally available, so fall back # to os.cpu_count() if we get an error here. 
num_jobs = len(os.sched_getaffinity(0)) except AttributeError: From d461230ec43185b922461c87553280354f25ae82 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:58:34 +0000 Subject: [PATCH 62/76] utils.cmake typo --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f0a73316427b7..eb43502f13e0e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,7 +130,7 @@ endmacro() # macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) - message(STATUS "${GPU_LANG} supprted arches: ${GPU_SUPPORTED_ARCHES_LIST}") + message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES_LIST}") if (${GPU_LANG} STREQUAL "HIP") # From 8ff011652dfc451807d65ead6ff8fe9d002caaa9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 22:39:35 +0000 Subject: [PATCH 63/76] more detailed comment for libtorch_python.so --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b1bc113b97a9..ea873c65e0010 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,9 +57,16 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # find_package(Torch REQUIRED) -# For some reason torch does not add libtorch_python.so to the list of torch -# libraries to link. Find it by hand using `append_torchlib_if_found` from +# +# Normally `torch.utils.cpp_extension.CUDAExtension` would add +# `libtorch_python.so` for linking against an extension. Torch's cmake +# configuration does not include this library (presumably since the cmake +# config is used for standalone C++ binaries that link against torch). +# The `libtorch_python.so` library defines some of the glue code between +# torch/python via pybind and is required by VLLM extensions for this +# reason. So, add it by manually using `append_torchlib_if_found` from # torch's cmake setup. 
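As an aside on the comment above: if `append_torchlib_if_found` were ever unavailable from torch's cmake setup, the same library could be located by hand. A rough sketch only, assuming torch's usual install layout where `libtorch_python.so` sits under `${TORCH_INSTALL_PREFIX}/lib` (with `TORCH_INSTALL_PREFIX` provided by TorchConfig.cmake):

    # Locate libtorch_python manually and append it to the torch link libraries.
    find_library(TORCH_PYTHON_LIBRARY torch_python
      HINTS "${TORCH_INSTALL_PREFIX}/lib")
    if (TORCH_PYTHON_LIBRARY)
      list(APPEND TORCH_LIBRARIES "${TORCH_PYTHON_LIBRARY}")
    else()
      message(WARNING "libtorch_python not found; the pybind glue may fail to link.")
    endif()
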
+# append_torchlib_if_found(torch_python) # From 2e48dd7eb2ef7fdf8c4c576324476a207321a859 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 19:04:26 +0000 Subject: [PATCH 64/76] use cmake_parse_arguments for define_gpu_extension_target --- CMakeLists.txt | 33 ++++++++++++---------- cmake/utils.cmake | 71 +++++++++++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea873c65e0010..225c1832ea31a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,11 +156,12 @@ endif() define_gpu_extension_target( _C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_EXT_SRC}" - "${VLLM_GPU_FLAGS}" - "${VLLM_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + WITH_SOABI) # # _moe_C extension @@ -172,11 +173,12 @@ set(VLLM_MOE_EXT_SRC define_gpu_extension_target( _moe_C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_MOE_EXT_SRC}" - "${VLLM_GPU_FLAGS}" - "${VLLM_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + WITH_SOABI) # # _punica_C extension @@ -229,11 +231,12 @@ endif() define_gpu_extension_target( _punica_C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_PUNICA_EXT_SRC}" - "${VLLM_PUNICA_GPU_FLAGS}" - "${VLLM_PUNICA_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_PUNICA_EXT_SRC} + COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} + ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} + WITH_SOABI) # # Add the `default` target which detects which extensions should be diff --git a/cmake/utils.cmake b/cmake/utils.cmake index eb43502f13e0e..75fba84654233 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -258,51 +258,68 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) endmacro() # -# Define a target named `MOD_NAME` for a single extension. The +# Define a target named `GPU_MOD_NAME` for a single extension. The # arguments are: # -# MOD_DEST - module destination directory. -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. +# DESTINATION - module destination directory. +# LANGUAGE - the GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - list of source files relative to CMakeLists.txt +# directory. +# ARCHITECTURES - a list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# COMPILE_FLAGS - extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - extra include directories. +# LINK_LIBRARIES - extra link libraries. +# WITH_SOABI - generate library with python SOABI suffix name. # # Note: optimization level/debug info is set via cmake build type. # -function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC - MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) +function (define_gpu_extension_target GPU_MOD_NAME) + cmake_parse_arguments(PARSE_ARGV 1 + GPU + "WITH_SOABI" + "DESTINATION;LANGUAGE" + "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") # Add hipify preprocessing step when building with HIP/ROCm. 
- if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + if (GPU_LANGUAGE STREQUAL "HIP") + hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") endif() - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + if (GPU_WITH_SOABI) + set(GPU_WITH_SOABI WITH_SOABI) + else() + set(GPU_WITH_SOABI) + endif() + + Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI}) - if (MOD_GPU_LANG STREQUAL "HIP") + if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) endif() - if (MOD_GPU_ARCHES) - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") + if (GPU_ARCHITECTURES) + set_target_properties(${GPU_MOD_NAME} PROPERTIES + ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") endif() - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) + target_compile_options(${GPU_MOD_NAME} PRIVATE + $<$:${GPU_COMPILE_FLAGS}>) - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + target_compile_definitions(${GPU_MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE csrc) + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES} + ${GPU_LIBRARIES}) - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION}) endfunction() From 5fc7a6acf7ef817be544d63ee9ea9fe75ac0704a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 19:45:30 +0000 Subject: [PATCH 65/76] convert some macros to functions --- cmake/utils.cmake | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 75fba84654233..8adf37cdfcb57 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -24,11 +24,11 @@ endmacro() # has trailing whitespace stripped. If an error is encountered when running # python, a fatal message `ERR_MSG` is issued. # -macro (run_python OUT EXPR ERR_MSG) +function (run_python OUT EXPR ERR_MSG) execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} + OUTPUT_VARIABLE PYTHON_OUT RESULT_VARIABLE PYTHON_ERROR_CODE ERROR_VARIABLE PYTHON_STDERR OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -36,7 +36,8 @@ macro (run_python OUT EXPR ERR_MSG) if(NOT PYTHON_ERROR_CODE EQUAL 0) message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") endif() -endmacro() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() # Run `EXPR` in python after importing `PKG`. Use the result of this to extend # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. @@ -51,7 +52,7 @@ endmacro() # of CUDA source files. The names of the corresponding "hipified" sources are # stored in `OUT_SRCS`. # -macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # # Split into C++ and non-C++ (i.e. CUDA) sources. 
# @@ -81,42 +82,43 @@ macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) COMMENT "Running hipify on ${NAME} extension source files.") # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() # # Get additional GPU compiler flags from torch. # -macro(get_torch_gpu_compiler_flags GPU_FLAGS GPU_LANG) +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) if (${GPU_LANG} STREQUAL "CUDA") # # Get common NVCC flags from torch. # - run_python(${GPU_FLAGS} + run_python(GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND ${GPU_FLAGS} "-DENABLE_FP8_E5M2") + list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2") endif() elseif(${GPU_LANG} STREQUAL "HIP") # # Get common HIP/HIPCC flags from torch. # - run_python(${GPU_FLAGS} + run_python(GPU_FLAGS "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND ${GPU_FLAGS} + list(APPEND GPU_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") endif() -endmacro() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() # Macro for converting a `gencode` version number to a cmake version number. macro(string_to_ver OUT_VER IN_STR) @@ -128,9 +130,8 @@ endmacro() # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in # `GPU_ARCHES`. # -macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) - set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) - message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES_LIST}") +function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES}") if (${GPU_LANG} STREQUAL "HIP") # @@ -143,17 +144,17 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - set(${GPU_ARCHES}) + set(GPU_ARCHES) foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST GPU_SUPPORTED_ARCHES_LIST) - list(APPEND ${GPU_ARCHES} ${ARCH}) + if (ARCH IN_LIST GPU_SUPPORTED_ARCHES) + list(APPEND GPU_ARCHES ${ARCH}) endif() endforeach() - if(NOT ${GPU_ARCHES}) + if(NOT GPU_ARCHES) message(FATAL_ERROR "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES_LIST}.") + " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES}.") endif() elseif(${GPU_LANG} STREQUAL "CUDA") @@ -199,7 +200,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") # Initialize the architecture lists to empty. - set(${GPU_ARCHES}) + set(GPU_ARCHES) # Process each `gencode` flag. foreach(ARCH ${_CUDA_ARCH_FLAGS}) @@ -245,17 +246,18 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Check if the current version is in the supported arch list. 
string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES_LIST) + if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES) message(STATUS "discarding unsupported CUDA arch ${VER}.") continue() endif() # Add it to the arch list. - list(APPEND ${GPU_ARCHES} "${CODE_ARCH}${VIRT}") + list(APPEND GPU_ARCHES "${CODE_ARCH}${VIRT}") endforeach() endif() - message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") -endmacro() + message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") + set(${OUT_GPU_ARCHES} ${GPU_ARCHES} PARENT_SCOPE) +endfunction() # # Define a target named `GPU_MOD_NAME` for a single extension. The From baa1fa83b51351d93ecfcdaa27fc6e385de4c163 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 20:48:49 +0000 Subject: [PATCH 66/76] use underscores for variables set in macros --- cmake/utils.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8adf37cdfcb57..56b3b2f0da059 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -9,12 +9,12 @@ macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) if (NOT Python_FOUND) message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") endif() - set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) - if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) message(FATAL_ERROR - "Python version (${VER}) is not one of the supported versions: " - "${SUPPORTED_VERSIONS}.") + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") endif() message(STATUS "Found python matching: ${EXECUTABLE}.") endmacro() @@ -42,9 +42,9 @@ endfunction() # Run `EXPR` in python after importing `PKG`. Use the result of this to extend # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH + run_python(_PREFIX_PATH "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) endmacro() # From 5c0bd3238e1da21b3b318b5cd28ddfdf7d1c6b82 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:37:48 +0000 Subject: [PATCH 67/76] turn override_gpu_arches back into a macro --- CMakeLists.txt | 2 +- cmake/utils.cmake | 68 ++++++++++++++++++++++++----------------------- setup.py | 2 +- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 225c1832ea31a..dfb1bef0ed5a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -273,7 +273,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (ARCH_VER VERSION_LESS 8.0) message(STATUS "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") - set(ENABLE_PUNICA false) +# set(ENABLE_PUNICA false) break() endif() endforeach() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 56b3b2f0da059..686eea878632f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,8 +130,11 @@ endmacro() # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in # `GPU_ARCHES`. # -function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) - message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES}") +# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. 
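The note above comes down to CMake scoping rules: a function() body runs in its own variable scope, so any set() it performs (including edits to CMAKE_CUDA_FLAGS) is discarded on return unless PARENT_SCOPE is used, whereas a macro() is expanded inline and mutates the caller's variables directly. A minimal sketch with hypothetical helper names:

    # Inside a function, the modified value is dropped when the function returns.
    function(append_cuda_flag_fn)
      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
    endfunction()

    # Inside a macro, the caller's CMAKE_CUDA_FLAGS is updated in place.
    macro(append_cuda_flag_macro)
      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
    endmacro()

    append_cuda_flag_fn()     # no visible change to CMAKE_CUDA_FLAGS
    append_cuda_flag_macro()  # CMAKE_CUDA_FLAGS now ends with -lineinfo
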
+# +macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") if (${GPU_LANG} STREQUAL "HIP") # @@ -144,17 +147,17 @@ function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - set(GPU_ARCHES) - foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST GPU_SUPPORTED_ARCHES) - list(APPEND GPU_ARCHES ${ARCH}) + set(${GPU_ARCHES}) + foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() endforeach() - if(NOT GPU_ARCHES) + if(NOT ${GPU_ARCHES}) message(FATAL_ERROR "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES}.") + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() elseif(${GPU_LANG} STREQUAL "CUDA") @@ -200,63 +203,62 @@ function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") # Initialize the architecture lists to empty. - set(GPU_ARCHES) + set(${GPU_ARCHES}) # Process each `gencode` flag. - foreach(ARCH ${_CUDA_ARCH_FLAGS}) + foreach(_ARCH ${_CUDA_ARCH_FLAGS}) # For each flag, extract the version number and whether it refers to PTX # or native code. # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding # for that match. - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) endif() - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) + string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH}) + if (_SM) + set(_SM ${CMAKE_MATCH_1}) endif() - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) + string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH}) + if (_CODE) + set(_CODE ${CMAKE_MATCH_1}) endif() # Make sure the virtual architecture can be matched. - if (NOT COMPUTE) + if (NOT _COMPUTE) message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") + "Could not determine virtual architecture from: ${_ARCH}.") endif() # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) + if ((NOT _SM) AND (NOT _CODE)) message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") + "Could not determine a codegen architecture from: ${_ARCH}.") endif() - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) + if (_SM) + set(_VIRT "") + set(_CODE_ARCH ${_SM}) else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) + set(_VIRT "-virtual") + set(_CODE_ARCH ${_CODE}) endif() # Check if the current version is in the supported arch list. - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES) - message(STATUS "discarding unsupported CUDA arch ${VER}.") + string_to_ver(_CODE_VER ${_CODE_ARCH}) + if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + message(STATUS "discarding unsupported CUDA arch ${_VER}.") continue() endif() # Add it to the arch list. 
- list(APPEND GPU_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}") endforeach() endif() message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") - set(${OUT_GPU_ARCHES} ${GPU_ARCHES} PARENT_SCOPE) endfunction() # diff --git a/setup.py b/setup.py index 69c909ea92817..2a412d24c4b17 100644 --- a/setup.py +++ b/setup.py @@ -191,7 +191,7 @@ def _install_punica() -> bool: for i in range(device_count): major, minor = torch.cuda.get_device_capability(i) if major < 8: - install_punica = False +# install_punica = False break return install_punica From 34e1f00045b1143fb0ab4e7bf14ca091f20bf63f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:40:55 +0000 Subject: [PATCH 68/76] fix override_gpu_arches --- cmake/utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 686eea878632f..e3b994b8fa61b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -132,7 +132,7 @@ endmacro() # # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. # -macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") @@ -259,7 +259,7 @@ macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) endforeach() endif() message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") -endfunction() +endmacro() # # Define a target named `GPU_MOD_NAME` for a single extension. The From 384d897806ee2237237593f159de7bbd1e8df1f0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:53:21 +0000 Subject: [PATCH 69/76] fixes for punica configuration --- CMakeLists.txt | 8 ++++---- cmake/utils.cmake | 2 +- setup.py | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfb1bef0ed5a0..82bfcd68d11d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,17 +263,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") add_dependencies(default _moe_C) set(ENABLE_PUNICA) - # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or + # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. - if (DEFINED VLLM_INSTALL_PUNICA_KERNELS OR ENV{VLLM_INSTALL_PUNICA_KERNELS}) - set(ENABLE_PUNICA true) + if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) + set(ENABLE_PUNICA ON) foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) string_to_ver(ARCH_VER ${ARCH_VER_STR}) if (ARCH_VER VERSION_LESS 8.0) message(STATUS "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") -# set(ENABLE_PUNICA false) + set(ENABLE_PUNICA OFF) break() endif() endforeach() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index e3b994b8fa61b..a50f0e1e5a76f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -258,7 +258,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}") endforeach() endif() - message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") + message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") endmacro() # diff --git a/setup.py b/setup.py index 2a412d24c4b17..a71d3ece7a282 100644 --- a/setup.py +++ b/setup.py @@ -116,6 +116,9 @@ def configure(self, ext: CMakeExtension) -> None: # match. 
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] + if _install_punica(): + cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON'] + # # Setup parallelism and build tool # @@ -191,7 +194,7 @@ def _install_punica() -> bool: for i in range(device_count): major, minor = torch.cuda.get_device_capability(i) if major < 8: -# install_punica = False + install_punica = False break return install_punica From 59204f69c65b99efabdaef6c56110d229dd17b4e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:06:47 +0000 Subject: [PATCH 70/76] debugging --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 82bfcd68d11d0..874477ddb3ce5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,8 +267,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) set(ENABLE_PUNICA ON) + message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) + message(STATUS "ARCH_VER_STR: ${ARCH_VER_STR}") string_to_ver(ARCH_VER ${ARCH_VER_STR}) if (ARCH_VER VERSION_LESS 8.0) message(STATUS From ec6ae13068c750c27d25db4bb7e8fc975e5ee5e3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:11:49 +0000 Subject: [PATCH 71/76] debug --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 874477ddb3ce5..65c25c4b4660e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,6 +267,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) set(ENABLE_PUNICA ON) + message(STATUS "arches: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) From fa42ea9800272b27b108c4ff7cda646e4a78a9c0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:31:19 +0000 Subject: [PATCH 72/76] revamp punica installation logic --- CMakeLists.txt | 45 +++++++++++++++++---------------------------- setup.py | 10 ++-------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 65c25c4b4660e..770b830637649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,14 +229,19 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA") message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") endif() -define_gpu_extension_target( - _punica_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_PUNICA_EXT_SRC} - COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} - ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} - WITH_SOABI) +if (VLLM_PUNICA_GPU_ARCHES) + define_gpu_extension_target( + _punica_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_PUNICA_EXT_SRC} + COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} + ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} + WITH_SOABI) +else() + message(WARNING "Unable to create _punica_C target because none of the " + "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. 
>= 8.0") +endif() # # Add the `default` target which detects which extensions should be @@ -262,27 +267,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") add_dependencies(default _moe_C) - set(ENABLE_PUNICA) # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or - # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. - if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) - set(ENABLE_PUNICA ON) - message(STATUS "arches: ${CMAKE_CUDA_ARCHITECTURES}") - message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") - foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) - string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) - message(STATUS "ARCH_VER_STR: ${ARCH_VER_STR}") - string_to_ver(ARCH_VER ${ARCH_VER_STR}) - if (ARCH_VER VERSION_LESS 8.0) - message(STATUS - "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") - set(ENABLE_PUNICA OFF) - break() - endif() - endforeach() - endif() - - if (ENABLE_PUNICA) + # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and + # there are supported target arches. + if (VLLM_PUNICA_GPU_ARCHES AND + (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS)) message(STATUS "Enabling punica extension.") add_dependencies(default _punica_C) endif() diff --git a/setup.py b/setup.py index a71d3ece7a282..aecc8a8c54bd4 100644 --- a/setup.py +++ b/setup.py @@ -189,14 +189,8 @@ def _is_cuda() -> bool: def _install_punica() -> bool: - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - install_punica = False - break - return install_punica + print(f"DEBUG device_count = {torch.cuda.device_count()}") + return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) def get_path(*filepath) -> str: From c3be2cf238d36be359d3e4cb749fba068dbd4b5c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 Mar 2024 18:41:24 +0000 Subject: [PATCH 73/76] remove debugging cruft + add more detail to comment --- cmake/utils.cmake | 21 +++++++++++++-------- setup.py | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a50f0e1e5a76f..bb222bb437b1d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -265,19 +265,24 @@ endmacro() # Define a target named `GPU_MOD_NAME` for a single extension. The # arguments are: # -# DESTINATION - module destination directory. -# LANGUAGE - the GPU language for this module, e.g CUDA, HIP, +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, # etc. -# SOURCES - list of source files relative to CMakeLists.txt +# SOURCES - List of source files relative to CMakeLists.txt # directory. -# ARCHITECTURES - a list of target GPU architectures in cmake +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake # format. # Refer `CMAKE_CUDA_ARCHITECTURES` documentation # and `CMAKE_HIP_ARCHITECTURES` for more info. -# COMPILE_FLAGS - extra compiler flags passed to NVCC/hip. -# INCLUDE_DIRECTORIES - extra include directories. -# LINK_LIBRARIES - extra link libraries. -# WITH_SOABI - generate library with python SOABI suffix name. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LINK_LIBRARIES - Extra link libraries. 
+# WITH_SOABI - Generate library with python SOABI suffix name. # # Note: optimization level/debug info is set via cmake build type. # diff --git a/setup.py b/setup.py index aecc8a8c54bd4..4309136c5b460 100644 --- a/setup.py +++ b/setup.py @@ -189,7 +189,6 @@ def _is_cuda() -> bool: def _install_punica() -> bool: - print(f"DEBUG device_count = {torch.cuda.device_count()}") return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) From 2225ece7e8b4ba834e6c67a45028db38d596f3c3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 Mar 2024 18:46:57 +0000 Subject: [PATCH 74/76] merge marlin changes --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 770b830637649..29a531d44a9d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,7 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" + "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") endif() From 5393d4cd3ff0b5765eb316918879abccb34ba83e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 13 Mar 2024 19:09:22 +0000 Subject: [PATCH 75/76] fix merge error --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4309136c5b460..b31097b69f24d 100644 --- a/setup.py +++ b/setup.py @@ -308,7 +308,7 @@ def get_requirements() -> List[str]: if _is_cuda(): with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") - if nvcc_cuda_version <= Version("11.8"): + if get_nvcc_cuda_version() <= Version("11.8"): # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x for i in range(len(requirements)): if requirements[i].startswith("cupy-cuda12x"): From af254ce18b2f2dddb83e84a4e5c7d03f99e2e4da Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 18 Mar 2024 15:22:56 +0000 Subject: [PATCH 76/76] merge setup.py --- setup.py | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index b31097b69f24d..88787334be21a 100644 --- a/setup.py +++ b/setup.py @@ -184,31 +184,10 @@ def _is_neuron() -> bool: return torch_neuronx_installed -def _is_cuda() -> bool: - return (torch.version.cuda is not None) and not _is_neuron() - - def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) -def get_path(*filepath) -> str: - return os.path.join(ROOT_DIR, *filepath) - - -def find_version(filepath: str) -> str: - """Extract version information from the given filepath. - - Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py - """ - with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -263,11 +242,28 @@ def get_nvcc_cuda_version() -> Version: return nvcc_cuda_version +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def find_version(filepath: str) -> str: + """Extract version information from the given filepath. 
+ + Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py + """ + with open(filepath) as fp: + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + + def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) if _is_cuda(): - cuda_version = str(nvcc_cuda_version) + cuda_version = str(get_nvcc_cuda_version()) if cuda_version != MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] version += f"+cu{cuda_version_str}" @@ -283,11 +279,6 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" - elif _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" else: raise RuntimeError("Unknown runtime environment")