From e3a60a43045c3a232a0d0d40f4db13b8f7f46e85 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 10 Feb 2024 02:59:13 +0000 Subject: [PATCH 01/76] cmake based build system --- pyproject.toml | 1 + requirements-build.txt | 4 +- requirements-rocm.txt | 1 + requirements.txt | 1 + setup.py | 468 ++++++++++++++--------------------------- 5 files changed, 165 insertions(+), 310 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e0a01215ef997..b6d7649477dcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ + "cmake>=3.21", "ninja", "packaging", "setuptools >= 49.4.0", diff --git a/requirements-build.txt b/requirements-build.txt index 7e7e48a1313e5..8975f477fe96c 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,6 +1,6 @@ -# Should be mirrored in pyproject.toml +cmake>=3.21 ninja packaging setuptools>=49.4.0 torch==2.1.2 -wheel \ No newline at end of file +wheel diff --git a/requirements-rocm.txt b/requirements-rocm.txt index d5a3bd423b6b3..c30479e40f521 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1,3 +1,4 @@ +cmake>=3.21 ninja # For faster builds. typing-extensions>=4.8.0 starlette diff --git a/requirements.txt b/requirements.txt index d6c33ad85da58..c9a5bd6619402 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +cmake>=3.21 ninja # For faster builds. psutil ray >= 2.9 diff --git a/setup.py b/setup.py index 6f1f2faf54dbc..3ce85f2f9b35b 100644 --- a/setup.py +++ b/setup.py @@ -1,23 +1,17 @@ -import contextlib import io import os import re import subprocess -import warnings -from pathlib import Path -from typing import List, Set +import sys +from typing import List from packaging.version import parse, Version -import setuptools -import sys +from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext +from shutil import which import torch -import torch.utils.cpp_extension as torch_cpp_ext -from torch.utils.cpp_extension import ( - BuildExtension, - CUDAExtension, - CUDA_HOME, - ROCM_HOME, -) +# ROCM_HOME needed? +from torch.utils.cpp_extension import CUDA_HOME ROOT_DIR = os.path.dirname(__file__) @@ -32,10 +26,109 @@ MAIN_CUDA_VERSION = "12.1" -# Supported NVIDIA GPU architectures. 
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"} -# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) + +def is_ccache_available() -> bool: + return which("ccacheX") is not None + + +def is_ninja_available() -> bool: + return which("ninja") is not None + + +def remove_prefix(text, prefix): + if text.startswith(prefix): + return text[len(prefix):] + return text + + +class CMakeExtension(Extension): + + def __init__(self, name, cmake_lists_dir='.', **kwa): + Extension.__init__(self, name, sources=[], **kwa) + self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) + + +class cmake_build_ext(build_ext): + + def build_extensions(self): + # Ensure that CMake is present and working + try: + subprocess.check_output(['cmake', '--version']) + except OSError as e: + raise RuntimeError('Cannot find CMake executable') from e + + for ext in self.extensions: + + extdir = os.path.abspath( + os.path.dirname(self.get_ext_fullpath(ext.name))) + + # Note: optimization level + debug info set by the build type + cfg = os.getenv("VLLM_BUILD_TYPE", "RelWithDebInfo") + + cmake_args = [ + '-DCMAKE_BUILD_TYPE=%s' % cfg, + # Ask CMake to place the resulting library in the directory + # containing the extension + '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format( + cfg.upper(), extdir), + # Other intermediate static libraries are placed in a + # temporary build directory instead + '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( + cfg.upper(), self.build_temp), + # Hint CMake to use the same Python executable that + # is launching the build, prevents possible mismatching if + # multiple versions of Python are installed + '-DPYTHON_EXECUTABLE={}'.format(sys.executable), + ] + + # TODO: change default to 0 + verbose = bool(int(os.getenv('VERBOSE', '1'))) + if verbose: + cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] + + if is_ccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', + ] + + # + # Setup parallelism + # + num_jobs = os.cpu_count() + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) + cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] + + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + + ext_target_name = remove_prefix(ext.name, "vllm.") + + if is_ninja_available(): + build_tool = ['-G', 'Ninja'] + cmake_args += [ + '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', + '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), + ] + build_jobs = [] + else: + build_tool = ['-G', 'Unix Makefiles'] + build_jobs = ['-j', str(num_jobs)] + + # Config + subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + + cmake_args, + cwd=self.build_temp) + + # Build + build_args = [ + '--build', '.', '--config', cfg, '--target', ext_target_name + ] + subprocess.check_call(['cmake'] + build_args + build_jobs, + cwd=self.build_temp) def _is_cuda() -> bool: @@ -55,26 +148,36 @@ def _is_neuron() -> bool: return torch_neuronx_installed -# Compiler flags. -CXX_FLAGS = ["-g", "-O2", "-std=c++17"] -# TODO(woosuk): Should we use -O3? 
-NVCC_FLAGS = ["-O2", "-std=c++17"] +def _is_cuda() -> bool: + return (torch.version.cuda is not None) and not _is_neuron() + + +def _install_punica() -> bool: + install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) + device_count = torch.cuda.device_count() + for i in range(device_count): + major, minor = torch.cuda.get_device_capability(i) + if major < 8: + install_punica = False + break + return install_punica -if _is_hip(): - if ROCM_HOME is None: - raise RuntimeError("Cannot find ROCM_HOME. " - "ROCm must be available to build the package.") - NVCC_FLAGS += ["-DUSE_ROCM"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"] - NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"] -if _is_cuda() and CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) -ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 -CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + +def find_version(filepath: str) -> str: + """Extract version information from the given filepath. + + Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py + """ + with open(filepath) as fp: + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") def get_hipcc_rocm_version(): @@ -99,11 +202,6 @@ def get_hipcc_rocm_version(): return None -def glob(pattern: str): - root = Path(__name__).parent - return [str(p) for p in root.glob(pattern)] - - def get_neuronxcc_version(): import sysconfig site_dir = sysconfig.get_paths()["purelib"] @@ -123,12 +221,12 @@ def get_neuronxcc_version(): raise RuntimeError("Could not find HIP version in the output") -def get_nvcc_cuda_version(cuda_dir: str) -> Version: +def get_nvcc_cuda_version() -> Version: """Get the CUDA version from nvcc. Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py """ - nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], + nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 @@ -136,267 +234,6 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: return nvcc_cuda_version -def get_pytorch_rocm_arch() -> Set[str]: - """Get the cross section of Pytorch,and vllm supported gfx arches - - ROCM can get the supported gfx architectures in one of two ways - Either through the PYTORCH_ROCM_ARCH env var, or output from - rocm_agent_enumerator. - - In either case we can generate a list of supported arch's and - cross reference with VLLM's own ROCM_SUPPORTED_ARCHs. - """ - env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None) - - # If we don't have PYTORCH_ROCM_ARCH specified pull the list from - # rocm_agent_enumerator - if env_arch_list is None: - command = "rocm_agent_enumerator" - env_arch_list = (subprocess.check_output( - [command]).decode('utf-8').strip().replace("\n", ";")) - arch_source_str = "rocm_agent_enumerator" - else: - arch_source_str = "PYTORCH_ROCM_ARCH env variable" - - # List are separated by ; or space. - pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";")) - - # Filter out the invalid architectures and print a warning. 
- arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS) - - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - f"None of the ROCM architectures in {arch_source_str} " - f"({env_arch_list}) is supported. " - f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") - invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS - if invalid_arch_list: - warnings.warn( - f"Unsupported ROCM architectures ({invalid_arch_list}) are " - f"excluded from the {arch_source_str} output " - f"({env_arch_list}). Supported ROCM architectures are: " - f"{ROCM_SUPPORTED_ARCHS}.", - stacklevel=2) - return arch_list - - -def get_torch_arch_list() -> Set[str]: - # TORCH_CUDA_ARCH_LIST can have one or more architectures, - # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the - # compiler to additionally include PTX code that can be runtime-compiled - # and executed on the 8.6 or newer architectures. While the PTX code will - # not give the best performance on the newer architectures, it provides - # forward compatibility. - env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) - if env_arch_list is None: - return set() - - # List are separated by ; or space. - torch_arch_list = set(env_arch_list.replace(" ", ";").split(";")) - if not torch_arch_list: - return set() - - # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union( - {s + "+PTX" - for s in NVIDIA_SUPPORTED_ARCHS}) - arch_list = torch_arch_list.intersection(valid_archs) - # If none of the specified architectures are valid, raise an error. - if not arch_list: - raise RuntimeError( - "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " - f"variable ({env_arch_list}) is supported. " - f"Supported CUDA architectures are: {valid_archs}.") - invalid_arch_list = torch_arch_list - valid_archs - if invalid_arch_list: - warnings.warn( - f"Unsupported CUDA architectures ({invalid_arch_list}) are " - "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA architectures are: " - f"{valid_archs}.", - stacklevel=2) - return arch_list - - -if _is_hip(): - rocm_arches = get_pytorch_rocm_arch() - NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] -else: - # First, check the TORCH_CUDA_ARCH_LIST environment variable. - compute_capabilities = get_torch_arch_list() - -if _is_cuda() and not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") - -ext_modules = [] - -if _is_cuda(): - nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) - if not compute_capabilities: - # If no GPU is specified nor available, add all supported architectures - # based on the NVCC CUDA version. - compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() - if nvcc_cuda_version < Version("11.1"): - compute_capabilities.remove("8.6") - if nvcc_cuda_version < Version("11.8"): - compute_capabilities.remove("8.9") - compute_capabilities.remove("9.0") - # Validate the NVCC CUDA version. 
- if nvcc_cuda_version < Version("11.0"): - raise RuntimeError( - "CUDA 11.0 or higher is required to build the package.") - if (nvcc_cuda_version < Version("11.1") - and any(cc.startswith("8.6") for cc in compute_capabilities)): - raise RuntimeError( - "CUDA 11.1 or higher is required for compute capability 8.6.") - if nvcc_cuda_version < Version("11.8"): - if any(cc.startswith("8.9") for cc in compute_capabilities): - # CUDA 11.8 is required to generate the code targeting compute - # capability 8.9. However, GPUs with compute capability 8.9 can - # also run the code generated by the previous versions of CUDA 11 - # and targeting compute capability 8.0. Therefore, if CUDA 11.8 - # is not available, we target compute capability 8.0 instead of 8.9. - warnings.warn( - "CUDA 11.8 or higher is required for compute capability 8.9. " - "Targeting compute capability 8.0 instead.", - stacklevel=2) - compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) - compute_capabilities.add("8.0+PTX") - if any(cc.startswith("9.0") for cc in compute_capabilities): - raise RuntimeError( - "CUDA 11.8 or higher is required for compute capability 9.0.") - - NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy() - - # Add target compute capabilities to NVCC flags. - for capability in compute_capabilities: - num = capability[0] + capability[2] - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] - if capability.endswith("+PTX"): - NVCC_FLAGS += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - if int(capability[0]) >= 8: - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=sm_{num}" - ] - if capability.endswith("+PTX"): - NVCC_FLAGS_PUNICA += [ - "-gencode", f"arch=compute_{num},code=compute_{num}" - ] - - # Use NVCC threads to parallelize the build. 
- if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_threads = min(os.cpu_count(), nvcc_threads) - NVCC_FLAGS += ["--threads", str(num_threads)] - - if nvcc_cuda_version >= Version("11.8"): - NVCC_FLAGS += ["-DENABLE_FP8_E5M2"] - - # changes for punica kernels - NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS - REMOVE_NVCC_FLAGS = [ - '-D__CUDA_NO_HALF_OPERATORS__', - '-D__CUDA_NO_HALF_CONVERSIONS__', - '-D__CUDA_NO_BFLOAT16_CONVERSIONS__', - '-D__CUDA_NO_HALF2_OPERATORS__', - ] - for flag in REMOVE_NVCC_FLAGS: - with contextlib.suppress(ValueError): - torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag) - - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - install_punica = False - break - if install_punica: - ext_modules.append( - CUDAExtension( - name="vllm._punica_C", - sources=["csrc/punica/punica_ops.cc"] + - glob("csrc/punica/bgmv/*.cu"), - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS_PUNICA, - }, - )) -elif _is_neuron(): - neuronxcc_version = get_neuronxcc_version() - -vllm_extension_sources = [ - "csrc/cache_kernels.cu", - "csrc/attention/attention_kernels.cu", - "csrc/pos_encoding_kernels.cu", - "csrc/activation_kernels.cu", - "csrc/layernorm_kernels.cu", - "csrc/quantization/squeezellm/quant_cuda_kernel.cu", - "csrc/quantization/gptq/q_gemm.cu", - "csrc/cuda_utils_kernels.cu", - "csrc/moe_align_block_size_kernels.cu", - "csrc/pybind.cpp", -] - -if _is_cuda(): - vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") - vllm_extension_sources.append( - "csrc/quantization/marlin/marlin_cuda_kernel.cu") - vllm_extension_sources.append("csrc/custom_all_reduce.cu") - - # Add MoE kernels. - ext_modules.append( - CUDAExtension( - name="vllm._moe_C", - sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"), - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - )) - -if not _is_neuron(): - vllm_extension = CUDAExtension( - name="vllm._C", - sources=vllm_extension_sources, - extra_compile_args={ - "cxx": CXX_FLAGS, - "nvcc": NVCC_FLAGS, - }, - libraries=["cuda"] if _is_cuda() else [], - ) - ext_modules.append(vllm_extension) - - -def get_path(*filepath) -> str: - return os.path.join(ROOT_DIR, *filepath) - - -def find_version(filepath: str) -> str: - """Extract version information from the given filepath. 
- - Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py - """ - with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) @@ -413,10 +250,15 @@ def get_vllm_version() -> str: version += f"+rocm{rocm_version_str}" elif _is_neuron(): # Get the Neuron version - neuron_version = str(neuronxcc_version) + neuron_version = str(get_neuronxcc_version()) if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" + elif _is_cuda(): + cuda_version = str(get_nvcc_cuda_version()) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" else: raise RuntimeError("Unknown runtime environment") @@ -456,14 +298,24 @@ def get_requirements() -> List[str]: return requirements +ext_modules = [] + +if _is_cuda(): + ext_modules.append(CMakeExtension(name="vllm._moe_C")) + + if _install_punica(): + ext_modules.append(CMakeExtension(name="vllm._punica_C")) + +if not _is_neuron(): + ext_modules.append(CMakeExtension(name="vllm._C")) + package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } if os.environ.get("VLLM_USE_PRECOMPILED"): - ext_modules = [] package_data["vllm"].append("*.so") -setuptools.setup( +setup( name="vllm", version=get_vllm_version(), author="vLLM Team", @@ -485,11 +337,11 @@ def get_requirements() -> List[str]: "License :: OSI Approved :: Apache Software License", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], - packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs", - "examples", "tests")), + packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", + "tests")), python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, + cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, package_data=package_data, ) From b500bb44da85f44dad3a215f12c7a00536e64b64 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 10 Feb 2024 03:07:18 +0000 Subject: [PATCH 02/76] comment out newer bits --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ce85f2f9b35b..e955c70d80c19 100644 --- a/setup.py +++ b/setup.py @@ -301,7 +301,7 @@ def get_requirements() -> List[str]: ext_modules = [] if _is_cuda(): - ext_modules.append(CMakeExtension(name="vllm._moe_C")) +# ext_modules.append(CMakeExtension(name="vllm._moe_C")) if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) From cc2407d2e31efed8c21e4315c0472e667edd9831 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 02:58:00 +0000 Subject: [PATCH 03/76] merge --- CMakeLists.txt | 156 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 8 +-- 2 files changed, 157 insertions(+), 7 deletions(-) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000000..9b78770ec7960 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,156 @@ +cmake_minimum_required(VERSION 3.21) + +project(vllm_extensions LANGUAGES CXX) + +# +# Find where user site-packages are installed and add it to cmake's 
search path. +# + +if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import site; print(site.getusersitepackages())" + OUTPUT_VARIABLE SITE_PATH + ERROR_VARIABLE SITE_PATH_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(SITE_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate site-packages path," + " full error message:\n${SITE_PATH_ERR}") +endif() + +list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) + +# +# Find packages needed to compile +# +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Torch 2.1.2 EXACT REQUIRED) +append_torchlib_if_found(torch_python) +find_package(MPI REQUIRED) + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import torch.utils.cpp_extension as torch_cpp_ext; print(' '.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" + OUTPUT_VARIABLE TORCH_NVCC_FLAGS + ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(TORCH_NVCC_FLAGS STREQUAL "") + message(FATAL_ERROR "Unable to determine torch nvcc compiler flags," + " full error message:\n${TORCH_NVCC_FLAGS_ERR}") +endif() + +string(STRIP ${TORCH_NVCC_FLAGS} TORCH_NVCC_FLAGS) +list(APPEND NVCC_FLAGS ${TORCH_NVCC_FLAGS}) + +set(PUNICA_NVCC_FLAGS "${NVCC_FLAGS}") +foreach(OPT + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__" + ) + string(REPLACE ${OPT} "" PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) +endforeach() +string(STRIP ${PUNICA_NVCC_FLAGS} PUNICA_NVCC_FLAGS) + +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") +endif() + +# +# Check for existence of CUDA/HIP language support +# +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html +include(CheckLanguage) +check_language(HIP) +check_language(CUDA) + +if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") + enable_language(HIP) + list(APPEND NVCC_FLAGS "-DUSE_ROCM -U__HIP_NO_HALF_CONVERSIONS__ -U__HIP_NO_HALF_OPERATORS__") + + # TODO: intersect with this list? + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942") + endif() + + foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + list(APPEND NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + endforeach() +elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") + enable_language(CUDA) + set(IS_CUDA true) + + # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? + + # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES + # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") + # TODO: PTX stuff + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # This indicates support for both real architectures (i.e, no ptx). 
+ set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + endif() +else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") +endif() + +if(NVCC_THREADS) + list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") +endif() + +# +# Define target source files +# + +set(VLLM_EXT_SRC + "csrc/cache_kernels.cu" + "csrc/attention/attention_kernels.cu" + "csrc/pos_encoding_kernels.cu" + "csrc/activation_kernels.cu" + "csrc/layernorm_kernels.cu" + "csrc/quantization/squeezellm/quant_cuda_kernel.cu" + "csrc/quantization/gptq/q_gemm.cu" + "csrc/cuda_utils_kernels.cu" + "csrc/moe_align_block_size_kernels.cu" + "csrc/pybind.cpp") + +if(IS_CUDA) + list(APPEND VLLM_EXT_SRC + "csrc/quantization/awq/gemm_kernels.cu" + "csrc/custom_all_reduce.cu") +endif() + +File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") +File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") + +# +# Define targets +# +set(CMAKE_CXX_STANDARD 17) + +function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + # Note: optimization level/debug info is set by build type + if (IS_CUDA) + set(CUDA_LANG "CUDA") + else() + set(CUDA_LANG "HIP") + endif() + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_NVCC_FLAGS}>) + target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) +endfunction() + +define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") diff --git a/setup.py b/setup.py index e955c70d80c19..69fcfa047ef97 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,6 @@ from setuptools.command.build_ext import build_ext from shutil import which import torch -# ROCM_HOME needed? from torch.utils.cpp_extension import CUDA_HOME ROOT_DIR = os.path.dirname(__file__) @@ -19,11 +18,6 @@ assert sys.platform.startswith( "linux"), "vLLM only supports Linux platform (including WSL)." -# If you are developing the C++ backend of vLLM, consider building vLLM with -# `python setup.py develop` since it will give you incremental builds. 
-# The downside is that this method is deprecated, see -# https://github.com/pypa/setuptools/issues/917 - MAIN_CUDA_VERSION = "12.1" @@ -301,7 +295,7 @@ def get_requirements() -> List[str]: ext_modules = [] if _is_cuda(): -# ext_modules.append(CMakeExtension(name="vllm._moe_C")) + ext_modules.append(CMakeExtension(name="vllm._moe_C")) if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) From c9ac7ad590b67781e4f50f5fee5158122a72b194 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:06:38 +0000 Subject: [PATCH 04/76] try adding CMakeLists.txt to MANIFEST.in --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 0c897cf147f10..38c9e58b4e73e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include LICENSE include requirements.txt +include CMakeLists.txt recursive-include csrc * From 3123f57b73ee1e21a46a1bf68362c6753b5d96b4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:08:23 +0000 Subject: [PATCH 05/76] try adding it to dockerfile --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8be03b3567f0e..c2354ca1f470d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py +COPY CMakeLists.txt CMakeLists.txt COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py From a4d99ea8e442e886685da5775e8ec2cd1b0371b8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:12:30 +0000 Subject: [PATCH 06/76] add another path to CMAKE_PREFIX_PATH --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b78770ec7960..2783a5d119501 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,26 @@ endif() list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) +############### + +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "-c" + "import torch; print(torch.utils.cmake_prefix_path)" + OUTPUT_VARIABLE TORCH_PATH + ERROR_VARIABLE TORCH_PATH_ERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(TORCH_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate torch cmake_prefix_path," + " full error message:\n${TORCH_PATH_ERR}") +endif() + +list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) + +############### + + # # Find packages needed to compile # From 41dbdc959ee15b18e47b15a14ddcdd3a78a82b55 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:15:08 +0000 Subject: [PATCH 07/76] try again --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2783a5d119501..b1a8b7a6deacd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -find_package(MPI REQUIRED) +find_package(MPI) # find_package(MPI REQUIRED) execute_process( COMMAND From ac9d94bbdbb1e0ea974c47896d968ff544700cba Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:46:20 +0000 Subject: [PATCH 08/76] hack to test punica build --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b1a8b7a6deacd..02f1c1d8eaa8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,7 +115,9 @@ elseif(NOT 
CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # TODO: PTX stuff if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # This indicates support for both real architectures (i.e, no ptx). - set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + # TODO: punica not supported for less than 8.0 + # set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") + set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") From 102675d1f0d41b630d5ad65d0ab0a319b3f902a5 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:49:51 +0000 Subject: [PATCH 09/76] try again --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 02f1c1d8eaa8c..1f5fb846164e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,10 +115,10 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # TODO: PTX stuff if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # This indicates support for both real architectures (i.e, no ptx). - # TODO: punica not supported for less than 8.0 - # set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") - set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") endif() + # TODO: punica not supported for less than 8.0 + set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() From dfbafe31beefdfdf1ce26e58c72906131486962a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 03:53:23 +0000 Subject: [PATCH 10/76] try again --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f5fb846164e4..ea70486e6a672 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,8 +117,6 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") # This indicates support for both real architectures (i.e, no ptx). set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") endif() - # TODO: punica not supported for less than 8.0 - set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() @@ -173,6 +171,10 @@ function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() +# TODO: hacks punica not supported for less than 8.0 +set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") +set(CUDA_ARCHITECTURES "80;86;89;90") + define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") From 39a8589b11df0c3af2ffa5af8731a2dad9bc2043 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:05:23 +0000 Subject: [PATCH 11/76] cleanup path stuff. try hacking arches again --- CMakeLists.txt | 57 +++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea70486e6a672..d5d47167654f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,46 +3,31 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # -# Find where user site-packages are installed and add it to cmake's search path. +# Find where user site-packages and torch are installed and add it to cmake's search path. 
# -if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) -endif() - -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import site; print(site.getusersitepackages())" - OUTPUT_VARIABLE SITE_PATH - ERROR_VARIABLE SITE_PATH_ERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - -if(SITE_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate site-packages path," - " full error message:\n${SITE_PATH_ERR}") -endif() - -list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) - -############### +function (append_cmake_prefix_path PKG EXPR) + if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) + endif() -execute_process( + execute_process( COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import torch; print(torch.utils.cmake_prefix_path)" - OUTPUT_VARIABLE TORCH_PATH - ERROR_VARIABLE TORCH_PATH_ERR + "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" + OUTPUT_VARIABLE PREFIX_PATH + ERROR_VARIABLE PREFIX_PATH_ERR OUTPUT_STRIP_TRAILING_WHITESPACE) -if(TORCH_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate torch cmake_prefix_path," - " full error message:\n${TORCH_PATH_ERR}") -endif() + if(PREFIX_PATH STREQUAL "") + message(FATAL_ERROR "Failed to locate ${PKG} path," + " full error message:\n${PREFIX_PATH_ERR}") + endif() -list(APPEND CMAKE_PREFIX_PATH ${TORCH_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) +endfunction() -############### +append_cmake_prefix_path("site" "site.getusersitepackages()") +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # @@ -84,6 +69,10 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() +# hack arches to test +# TODO: need to strip out cuda arches < 8.0 for punica +set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") + # # Check for existence of CUDA/HIP language support # @@ -171,10 +160,6 @@ function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -# TODO: hacks punica not supported for less than 8.0 -set(CMAKE_CUDA_ARCHITECTURES "80;86;89;90") -set(CUDA_ARCHITECTURES "80;86;89;90") - define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") From f66b286d419d42f8ea29dd152c00e7ee6cf11681 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:15:40 +0000 Subject: [PATCH 12/76] fix typo --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5d47167654f8..4013e5fb3b3d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ function (append_cmake_prefix_path PKG EXPR) " full error message:\n${PREFIX_PATH_ERR}") endif() - list(APPEND CMAKE_PREFIX_PATH ${SITE_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endfunction() append_cmake_prefix_path("site" "site.getusersitepackages()") From b2784e0d852f9931edb5b9444ea5601c18669446 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sun, 11 Feb 2024 04:27:50 +0000 Subject: [PATCH 13/76] change function to macro --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4013e5fb3b3d9..3d69efdbbd63f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ project(vllm_extensions LANGUAGES CXX) # Find where user site-packages and torch are installed and add it to cmake's search path. 
# -function (append_cmake_prefix_path PKG EXPR) +macro (append_cmake_prefix_path PKG EXPR) if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() @@ -24,7 +24,7 @@ function (append_cmake_prefix_path PKG EXPR) endif() list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endfunction() +endmacro() append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") From 5464e3072ee22380b4c648cb8ccdfdbdc7205ff2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 22:25:35 +0000 Subject: [PATCH 14/76] flag hacking --- CMakeLists.txt | 106 +++++++++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3d69efdbbd63f..4ad10ad1e48b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,15 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -# -# Find where user site-packages and torch are installed and add it to cmake's search path. -# - +# add comment macro (append_cmake_prefix_path PKG EXPR) - if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) - endif() - execute_process( COMMAND "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" @@ -26,22 +19,32 @@ macro (append_cmake_prefix_path PKG EXPR) list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -append_cmake_prefix_path("site" "site.getusersitepackages()") -append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") - +# add comment why it comes before append_cmake_prefix_path +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(MPI) # +# Find where user site-packages and torch are installed and add it to cmake's search path. # Find packages needed to compile # -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +append_cmake_prefix_path("site" "site.getusersitepackages()") +append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") + + find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -find_package(MPI) # find_package(MPI REQUIRED) +#set(ENV{TORCH_CUDA_ARCH_LIST} "70;75;80;86;89;90") + +# +# Setup NVCC flags +# + +# add comment execute_process( COMMAND "${PYTHON_EXECUTABLE}" "-c" - "import torch.utils.cpp_extension as torch_cpp_ext; print(' '.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" + "import torch.utils.cpp_extension as torch_cpp_ext; print(';'.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" OUTPUT_VARIABLE TORCH_NVCC_FLAGS ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -51,27 +54,38 @@ if(TORCH_NVCC_FLAGS STREQUAL "") " full error message:\n${TORCH_NVCC_FLAGS_ERR}") endif() -string(STRIP ${TORCH_NVCC_FLAGS} TORCH_NVCC_FLAGS) -list(APPEND NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +set(NVCC_FLAGS ${TORCH_NVCC_FLAGS}) + +if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") +endif() + +if(NVCC_THREADS) + list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") +endif() -set(PUNICA_NVCC_FLAGS "${NVCC_FLAGS}") +# +# Copy flags+update for punica +# +set(PUNICA_NVCC_FLAGS ${NVCC_FLAGS}) foreach(OPT "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__" ) - string(REPLACE ${OPT} "" PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) + list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) endforeach() -string(STRIP ${PUNICA_NVCC_FLAGS} PUNICA_NVCC_FLAGS) +#string(REPLACE " " " " PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) + +# remove 
gencode flags added by pytorch +list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") +list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") -endif() -# hack arches to test -# TODO: need to strip out cuda arches < 8.0 for punica -set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") +message("torch nvcc: ${CUDA_NVCC_FLAGS}") +message("nvcc: ${NVCC_FLAGS}") +message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # # Check for existence of CUDA/HIP language support @@ -79,15 +93,15 @@ set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90") # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html include(CheckLanguage) check_language(HIP) -check_language(CUDA) +#check_language(CUDA) if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") enable_language(HIP) - list(APPEND NVCC_FLAGS "-DUSE_ROCM -U__HIP_NO_HALF_CONVERSIONS__ -U__HIP_NO_HALF_OPERATORS__") + list(APPEND NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") endif() foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) @@ -98,22 +112,23 @@ elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") set(IS_CUDA true) # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? + # cmake env var CUDAARCHS # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") # TODO: PTX stuff - if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - # This indicates support for both real architectures (i.e, no ptx). - set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") - endif() +# if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) +# # This indicates support for both real architectures (i.e, no ptx). 
+# set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") +# endif() + + set(VLLM_CUDA_ARCHES "70;75;80;86;89;90") + set(VLLM_PUNICA_CUDA_ARCHES "80;86;89;90") # >8.0 of VLLM_CUDA_ARCHITECTURES + else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() -if(NVCC_THREADS) - list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") -endif() - # # Define target source files # @@ -144,22 +159,29 @@ File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") # set(CMAKE_CXX_STANDARD 17) -function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS) +#this doesn't seem to work +#set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# add comment +function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + #add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type if (IS_CUDA) set(CUDA_LANG "CUDA") +# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) else() set(CUDA_LANG "HIP") +# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) endif() - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_NVCC_FLAGS}>) + target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}") +define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") From 22e3803287fd2afe3f19a795ffdcbad9a89c73e1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:00:57 +0000 Subject: [PATCH 15/76] strip arches out of CMAKE_CUDA_FLAGS --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ad10ad1e48b2..f868de2578521 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,14 +76,17 @@ foreach(OPT ) list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) endforeach() -#string(REPLACE " " " " PUNICA_NVCC_FLAGS ${PUNICA_NVCC_FLAGS}) # remove gencode flags added by pytorch list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") +#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "-gencode") +#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "arch=compute.*") +string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) message("torch nvcc: ${CUDA_NVCC_FLAGS}") +message("torch cuda_flags: ${CMAKE_CUDA_FLAGS}") message("nvcc: ${NVCC_FLAGS}") message("punica nvcc: ${PUNICA_NVCC_FLAGS}") @@ -93,7 +96,7 @@ message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html include(CheckLanguage) check_language(HIP) -#check_language(CUDA) +#check_language(CUDA) # picked up by torch if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") enable_language(HIP) @@ 
-185,3 +188,9 @@ endfunction() define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") + +#get_cmake_property(_variableNames VARIABLES) +#list (SORT _variableNames) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() From e55fc13fcec936bd2299b3fe9310330d9e07f833 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:22:45 +0000 Subject: [PATCH 16/76] more shenanigans --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f868de2578521..2f35895b4432d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,9 @@ macro (append_cmake_prefix_path PKG EXPR) endmacro() # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) # From 6fd7b599c95b1c892b437554854c63fcdd1cb270 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 12 Feb 2024 23:31:05 +0000 Subject: [PATCH 17/76] try again --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f35895b4432d..3781e3bf7ad9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,10 @@ macro (append_cmake_prefix_path PKG EXPR) endmacro() # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 7c26517a1c283016c682ce214b40d368d8807edb Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 02:58:40 +0000 Subject: [PATCH 18/76] fiddling around --- CMakeLists.txt | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3781e3bf7ad9c..ad6246471a5ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,36 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +# add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) +set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) +find_package(MPI) + +if(NOT DEFINED PYTHON_EXECUTABLE) + set(PYTHON_EXECUTABLE python3) +endif() + +# HACKS +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" "--version" + OUTPUT_VARIABLE JUNK) +message("junk: ${JUNK}") +execute_process( + COMMAND + "which" "-a" "python3" + OUTPUT_VARIABLE JUNK) +message("junk: ${JUNK}") +# HACKS + +# +# Find where user site-packages and torch are installed and add it to cmake's search path. 
+# Find packages needed to compile +# + # add comment macro (append_cmake_prefix_path PKG EXPR) execute_process( @@ -19,18 +49,6 @@ macro (append_cmake_prefix_path PKG EXPR) list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -# add comment why it comes before append_cmake_prefix_path -set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) -find_package(MPI) - -# -# Find where user site-packages and torch are installed and add it to cmake's search path. -# Find packages needed to compile -# append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") From b0a8531cb8cf19c0a849bc6e23813d60d09563d6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 03:42:56 +0000 Subject: [PATCH 19/76] add some debugging code --- CMakeLists.txt | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6246471a5ee..d7976269d6924 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,19 +14,6 @@ if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() -# HACKS -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "--version" - OUTPUT_VARIABLE JUNK) -message("junk: ${JUNK}") -execute_process( - COMMAND - "which" "-a" "python3" - OUTPUT_VARIABLE JUNK) -message("junk: ${JUNK}") -# HACKS - # # Find where user site-packages and torch are installed and add it to cmake's search path. # Find packages needed to compile @@ -187,8 +174,13 @@ set(CMAKE_CXX_STANDARD 17) #this doesn't seem to work #set(CUDA_PROPAGATE_HOST_FLAGS OFF) +message("pv: ${PYTHON_VERSION_STRING}") + +#set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) + # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") @@ -211,8 +203,8 @@ define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") -#get_cmake_property(_variableNames VARIABLES) -#list (SORT _variableNames) -#foreach (_variableName ${_variableNames}) -# message(STATUS "${_variableName}=${${_variableName}}") -#endforeach() +get_cmake_property(_variableNames VARIABLES) +list (SORT _variableNames) +foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") +endforeach() From d0622c36f1fe07bdfd6444274b554c1d4f0f089c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 03:56:22 +0000 Subject: [PATCH 20/76] try exact python version match for debugging --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d7976269d6924..ecd64a6f1c8d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,9 +5,9 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) 
set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) +find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +#set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) +#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) if(NOT DEFINED PYTHON_EXECUTABLE) From 49c0a9c96e67ee4f2061d71bfa61ae8887fc83a2 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:13:46 +0000 Subject: [PATCH 21/76] try with more permissive python version(s) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecd64a6f1c8d3..91c13ea5f2ae4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 3cea6a48d79beb68fd26bf1a50885ea33c64ae90 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:22:58 +0000 Subject: [PATCH 22/76] add debugging --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 91c13ea5f2ae4..0d9a93e4a4451 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path +set(CMAKE_FIND_DEBUG_MODE TRUE) set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) From 812fab65da689e8d7da17bf63aa33660f92a37ec Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:26:36 +0000 Subject: [PATCH 23/76] add debugging --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d9a93e4a4451..64d74a9e2e170 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,8 @@ project(vllm_extensions LANGUAGES CXX) set(CMAKE_FIND_DEBUG_MODE TRUE) set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) find_package(MPI) From 2654e8452eefd0aa5e24f6124e3d708077fa3163 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:45:09 +0000 Subject: [PATCH 24/76] try using find_package(Python3...) 
--- CMakeLists.txt | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64d74a9e2e170..213553628efbb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,12 +4,21 @@ project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path set(CMAKE_FIND_DEBUG_MODE TRUE) -set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +#set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) +#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) + +find_package(Python3 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) + +#find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +#if (NOT Python_FOUND) +# find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +#endif() + + set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) + find_package(MPI) if(NOT DEFINED PYTHON_EXECUTABLE) @@ -176,14 +185,14 @@ set(CMAKE_CXX_STANDARD 17) #this doesn't seem to work #set(CUDA_PROPAGATE_HOST_FLAGS OFF) -message("pv: ${PYTHON_VERSION_STRING}") +#message("pv: ${PYTHON_VERSION_STRING}") #set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + Python3_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type From d5b6a2dff887c83435e48718412d87db04c354df Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:53:32 +0000 Subject: [PATCH 25/76] try multiple find_package calls for python --- CMakeLists.txt | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 213553628efbb..1031a9cf65358 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,13 +7,10 @@ set(CMAKE_FIND_DEBUG_MODE TRUE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python3 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) - -#find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) -#if (NOT Python_FOUND) -# find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) -#endif() - +find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +if (NOT Python_FOUND) + find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) +endif() set(CMAKE_FIND_DEBUG_MODE FALSE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) @@ -192,7 +189,7 @@ set(CMAKE_CXX_STANDARD 17) # add comment function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) - Python3_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) #add_library(${MOD_NAME} MODULE ${MOD_SRC}) set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") # Note: optimization level/debug info is set by build type From 
e70ebcef509313de32dc3b88e365016837db68a4 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 13 Feb 2024 17:55:21 +0000 Subject: [PATCH 26/76] try multiple find_packages for python --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1031a9cf65358..5ca1502590011 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_FIND_DEBUG_MODE TRUE) #set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) #set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) -find_package(Python 3.8 EXACT REQUIRED COMPONENTS Interpreter Development.Module) +find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) if (NOT Python_FOUND) find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) endif() From 06f9d55fa88b3f337860c977f3f843be7de38514 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:25:55 +0000 Subject: [PATCH 27/76] arch flag parsing in cmake, yay --- CMakeLists.txt | 199 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca1502590011..eec144244145e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,28 +3,21 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # add comment why it comes before append_cmake_prefix_path -set(CMAKE_FIND_DEBUG_MODE TRUE) -#set(CMAKE_FIND_PACKAGE_SORT_ORDER NATURAL) -#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION ASC) - +# add comment why we need to do this multiple times (TODO: maybe use a loop?) find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) if (NOT Python_FOUND) find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) endif() -set(CMAKE_FIND_DEBUG_MODE FALSE) -#set(CMAKE_FIND_PACKAGE_SORT_ORDER NONE) -#set(CMAKE_FIND_PACKAGE_SORT_DIRECTION DESC) - -find_package(MPI) - if(NOT DEFINED PYTHON_EXECUTABLE) set(PYTHON_EXECUTABLE python3) endif() + +find_package(MPI) + # # Find where user site-packages and torch are installed and add it to cmake's search path. -# Find packages needed to compile # # add comment @@ -51,12 +44,12 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") find_package(Torch 2.1.2 EXACT REQUIRED) append_torchlib_if_found(torch_python) -#set(ENV{TORCH_CUDA_ARCH_LIST} "70;75;80;86;89;90") - # -# Setup NVCC flags +# Setup extra NVCC flags # +# TODO: IS_CUDA only? 
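# (note, not part of the original patch) The execute_process() call below
# queries torch.utils.cpp_extension.COMMON_NVCC_FLAGS from the active torch
# installation so the extensions are compiled with the same nvcc defines that
# torch's own extension builder would add; those flags are CUDA specific,
# hence the "IS_CUDA only?" TODO above.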
+ # add comment execute_process( COMMAND @@ -71,79 +64,147 @@ if(TORCH_NVCC_FLAGS STREQUAL "") " full error message:\n${TORCH_NVCC_FLAGS_ERR}") endif() -set(NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +set(VLLM_NVCC_FLAGS ${TORCH_NVCC_FLAGS}) if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND NVCC_FLAGS "-DENABLE_FP8_E5M2") + list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() if(NVCC_THREADS) - list(APPEND NVCC_FLAGS "--threads=${NVCC_THREADS}") + list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") endif() +set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + # # Copy flags+update for punica # -set(PUNICA_NVCC_FLAGS ${NVCC_FLAGS}) + foreach(OPT "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__" ) - list(REMOVE_ITEM PUNICA_NVCC_FLAGS ${OPT}) + list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS ${OPT}) endforeach() +# +# deal with arch flags here +# + +# +# CUDA_NVCC_FLAGS holds the complete + canonical flags at this point +# make two versions: regular, punica +# strip out stuff from punica + update versions +# + +message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") +message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # remove gencode flags added by pytorch -list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") -list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "-gencode") -#list(FILTER CMAKE_CUDA_FLAGS EXCLUDE REGEX "arch=compute.*") +#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") +#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") +#string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) + + +# +# Setup arch flags +# + +string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) +message("arch flags: ${VLLM_CUDA_ARCH_FLAGS}") +# filter ARCH_FLAGS and add them back into CMAKE_CUDA_FLAGS + +#set(VLLM_PUNICA_CUDA_ARCH_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) +#list(FILTER VLLM_PUNICA_CUDA_ARCH_FLAGS EXCLUDE REGEX "compute_[1-7][0-9]") +message("post arch flags: ${VLLM_CUDA_ARCH_FLAGS}") +message("post punica arch flags: ${VLLM_PUNICA_CUDA_ARCH_FLAGS}") + +message("nvcc: ${VLLM_NVCC_FLAGS}") +message("punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + +#list(APPEND VLLM_NVCC_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) +#list(APPEND VLLM_PUNICA_NVCC_FLAGS ${VLLM_PUNICA_CUDA_ARCH_FLAGS}) + +# the painful way: NOTE needs to only happen w/CUDA +set(VLLM_CUDA_ARCHES) +set(VLLM_PUNICA_CUDA_ARCHES) + +macro(string_to_ver VER STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) +endmacro() + +foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) + string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) +# message("arch: ${COMPUTE}") + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a*\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) +# message("sm: ${SM}") + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a*\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) +# message("code: ${CODE}") + endif() + + if (COMPUTE AND SM) + list(APPEND VLLM_CUDA_ARCHES ${SM}) + string_to_ver(SM_VER ${SM}) + if (SM_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) + endif() + else() + list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") + string_to_ver(CODE_VER ${CODE}) + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") + endif() + endif() 
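# (illustrative example of the mapping above, not from the patch) a torch flag
# like "-gencode arch=compute_80,code=sm_80" ends up as the entry "80" in
# VLLM_CUDA_ARCHES (and, being >= 8.0, also in VLLM_PUNICA_CUDA_ARCHES), while
# a PTX-only flag such as "-gencode arch=compute_90,code=compute_90" becomes
# "90-virtual"; both forms match what the CUDA_ARCHITECTURES target property
# expects, e.g.:
#   set_target_properties(_C PROPERTIES CUDA_ARCHITECTURES "80;90-virtual")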
+endforeach() + +message("post nvcc: ${VLLM_NVCC_FLAGS}") +message("post punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") +message("post nvcc arch: ${VLLM_CUDA_ARCHES}") +message("post punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") -message("torch nvcc: ${CUDA_NVCC_FLAGS}") -message("torch cuda_flags: ${CMAKE_CUDA_FLAGS}") -message("nvcc: ${NVCC_FLAGS}") -message("punica nvcc: ${PUNICA_NVCC_FLAGS}") # # Check for existence of CUDA/HIP language support # # https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html -include(CheckLanguage) -check_language(HIP) +#include(CheckLanguage) +#check_language(HIP) #check_language(CUDA) # picked up by torch -if(NOT CMAKE_HIP_COMPILER STREQUAL "NOTFOUND") +# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat them. + +if(HIP_FOUND) enable_language(HIP) - list(APPEND NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") + list(APPEND VLLM_NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") endif() foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") endforeach() -elseif(NOT CMAKE_CUDA_COMPILER STREQUAL "NOTFOUND") +elseif(CUDA_FOUND) enable_language(CUDA) set(IS_CUDA true) - # TODO: parse TORCH_CUDA_ARCH_LIST -> CMAKE_CUDA_ARCHITECTURES? - # cmake env var CUDAARCHS - - # https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html#prop_tgt:CUDA_ARCHITECTURES - # set_target_properties(tgt PROPERTIES CUDA_ARCHITECTURES "35;50;72") - # TODO: PTX stuff -# if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) -# # This indicates support for both real architectures (i.e, no ptx). -# set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") -# endif() - - set(VLLM_CUDA_ARCHES "70;75;80;86;89;90") - set(VLLM_PUNICA_CUDA_ARCHES "80;86;89;90") # >8.0 of VLLM_CUDA_ARCHITECTURES + # TODO: check supported? 
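# (sketch of the check this TODO asks for; the next two commits add essentially
# this inside the gencode loop above, together with an NVIDIA_SUPPORTED_ARCHS
# list they introduce)
#   string_to_ver(SM_VER ${SM})                  # e.g. "75" -> "7.5"
#   if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS)
#     continue()                                 # skip arches vllm does not support
#   endif()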
else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") @@ -171,6 +232,7 @@ if(IS_CUDA) "csrc/custom_all_reduce.cu") endif() +#TODO: list files File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") @@ -179,40 +241,39 @@ File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") # set(CMAKE_CXX_STANDARD 17) -#this doesn't seem to work -#set(CUDA_PROPAGATE_HOST_FLAGS OFF) - -#message("pv: ${PYTHON_VERSION_STRING}") - -#set(PYTHON_SABI_VERSION "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - # add comment -function(define_module_target MOD_NAME MOD_SRC MOD_NVCC_FLAGS MOD_CUDA_ARCHES) - # Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} USE_SABI ${PYTHON_SABI_VERSION}) +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - #add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") - # Note: optimization level/debug info is set by build type + if (IS_CUDA) set(CUDA_LANG "CUDA") -# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) else() set(CUDA_LANG "HIP") -# target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) endif() - target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_NVCC_FLAGS}>) + + # Note: optimization level/debug info is set by build type + # target_compile_options(${MOD_NAME} BEFORE PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) + target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) + +# get_target_property(XXX ${MOD_NAME} COMPILE_OPTIONS) +# message("XXX: ${XXX}") +# get_target_property(XXX ${MOD_NAME} COMPILE_FEATURES) +# message("XXX: ${XXX}") + target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() -define_module_target(_C "${VLLM_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") +define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") +define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") -endforeach() +#get_cmake_property(_variableNames VARIABLES) +#list (SORT _variableNames) +#foreach (_variableName ${_variableNames}) +# message(STATUS "${_variableName}=${${_variableName}}") +#endforeach() From ef12d5dfe5ea408197f1e962c251299963483e29 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:50:50 +0000 Subject: [PATCH 28/76] filter out unsupported arches --- CMakeLists.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eec144244145e..edd6324c6dbb0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,9 @@ macro(string_to_ver VER STR) string(REGEX REPLACE 
"\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) endmacro() +set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") + foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) if (COMPUTE) @@ -158,14 +161,20 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) endif() if (COMPUTE AND SM) - list(APPEND VLLM_CUDA_ARCHES ${SM}) string_to_ver(SM_VER ${SM}) + if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + continue() + endif() + list(APPEND VLLM_CUDA_ARCHES ${SM}) if (SM_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) endif() else() - list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") string_to_ver(CODE_VER ${CODE}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + continue() + endif() + list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") if (CODE_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") endif() From 4d748676bd69dbcce15cdb9b209f1ae794314270 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 14:51:11 +0000 Subject: [PATCH 29/76] filter out unsupported arches --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index edd6324c6dbb0..b27ee18d62e88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,6 +163,7 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) if (COMPUTE AND SM) string_to_ver(SM_VER ${SM}) if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + # TODO: issue warning? continue() endif() list(APPEND VLLM_CUDA_ARCHES ${SM}) @@ -172,6 +173,7 @@ foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) else() string_to_ver(CODE_VER ${CODE}) if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + # TODO: issue warning? continue() endif() list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") From 9ee738d0b0eeb94b7584f01c1fed57755b1a9cd1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 14 Feb 2024 20:27:11 +0000 Subject: [PATCH 30/76] cleanups + add comments --- CMakeLists.txt | 423 +++++++++++++++++++++++++++++-------------------- 1 file changed, 248 insertions(+), 175 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b27ee18d62e88..18aadcd28d12a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,69 +2,126 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -# add comment why it comes before append_cmake_prefix_path -# add comment why we need to do this multiple times (TODO: maybe use a loop?) -find_package(Python 3.8 EXACT COMPONENTS Interpreter Development.Module) -if (NOT Python_FOUND) - find_package(Python 3.8...3.11 REQUIRED COMPONENTS Interpreter Development.Module) -endif() - -if(NOT DEFINED PYTHON_EXECUTABLE) - set(PYTHON_EXECUTABLE python3) -endif() +# +# Supported python verions. These versions will be searched in order, the +# first match will be selected. +# +set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") +# Supported NVIDIA architectures +set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -find_package(MPI) +# Supported AMD GPU architectures +set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") # -# Find where user site-packages and torch are installed and add it to cmake's search path. +# Loop thru all supported python versions until we find the first match. +# Cmake is unable to pick the lowest supported version when multiple +# versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. 
# +foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) + message(STATUS "Attempting to find python ${VER} package.") + find_package(Python ${VER} EXACT COMPONENTS Interpreter Development.Module) + if (Python_FOUND) + message(STATUS "Found python version ${VER} (${Python_EXECUTABLE}).") + break() + endif() +endforeach() -# add comment -macro (append_cmake_prefix_path PKG EXPR) +if (NOT Python_FOUND) + message(FATAL_ERROR + "No supported version of python found. ('${PYTHON_SUPPORTED_VERSIONS}')") +endif() + +# +# Run EXPR in python. The standard output of python is stored in OUT and has +# trailing whitespace stripped. If an error is encountered when running python, +# a fatal message ERR_MSG is issued. +# +macro (run_python OUT EXPR ERR_MSG) execute_process( COMMAND - "${PYTHON_EXECUTABLE}" "-c" "import ${PKG}; print(${EXPR})" - OUTPUT_VARIABLE PREFIX_PATH - ERROR_VARIABLE PREFIX_PATH_ERR + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR OUTPUT_STRIP_TRAILING_WHITESPACE) - if(PREFIX_PATH STREQUAL "") - message(FATAL_ERROR "Failed to locate ${PKG} path," - " full error message:\n${PREFIX_PATH_ERR}") + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") endif() +endmacro() + +# +# Try to find MPI package +# +find_package(MPI) +# +# Find where user site-packages and torch are installed and add it to cmake's +# search path. +# + +# Run EXPR in python after importing PKG. Use the result of this to extend +# CMAKE_PREFIX_PATH so we can import the torch cmake configuration. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() +# Add user site-packages and torch path to CMAKE_PREFIX_PATH append_cmake_prefix_path("site" "site.getusersitepackages()") append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") - +# +# Import torch cmake configuration. +# Torch also imports CUDA/HIP packages with some customizations, so we do not +# need to do this explicitly with check_language/enable_language, etc. +# find_package(Torch 2.1.2 EXACT REQUIRED) + +# For some reason torch does not add libtorch_python.so to the list of torch +# libraries to link. Find it by hand using 'append_torchlib_if_found' from +# torch's cmake setup. append_torchlib_if_found(torch_python) +if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) + message(FATAL_ERROR "Can't find CUDA or HIP installation.") +endif() + +if (NOT HIP_FOUND AND CUDA_FOUND) + set(IS_CUDA true) +endif() + # # Setup extra NVCC flags # +# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat +# detect them explicitly with check_language, etc. +# +if (HIP_FOUND) + list(APPEND VLLM_NVCC_FLAGS + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__") -# TODO: IS_CUDA only? - -# add comment -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" "-c" - "import torch.utils.cpp_extension as torch_cpp_ext; print(';'.join(torch_cpp_ext.COMMON_NVCC_FLAGS))" - OUTPUT_VARIABLE TORCH_NVCC_FLAGS - ERROR_VARIABLE TORCH_NVCC_FLAGS_ERR - OUTPUT_STRIP_TRAILING_WHITESPACE) + # TODO: intersect with this list? 
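# (sketch of the intersection this TODO asks for; a later cleanup commit adds
# essentially this)
#   set(VLLM_HIP_ARCHITECTURES)
#   foreach(ARCH ${CMAKE_HIP_ARCHITECTURES})
#     if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS)
#       list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH})
#     endif()
#   endforeach()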
+ if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + endif() -if(TORCH_NVCC_FLAGS STREQUAL "") - message(FATAL_ERROR "Unable to determine torch nvcc compiler flags," - " full error message:\n${TORCH_NVCC_FLAGS_ERR}") + foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + endforeach() endif() -set(VLLM_NVCC_FLAGS ${TORCH_NVCC_FLAGS}) +# TODO: IS_CUDA only? + +# Get common NVCC flags from torch. +run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") @@ -80,149 +137,163 @@ set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) # Copy flags+update for punica # -foreach(OPT - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__" - ) - list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS ${OPT}) -endforeach() +list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") -# -# deal with arch flags here -# +message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") +message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") # -# CUDA_NVCC_FLAGS holds the complete + canonical flags at this point -# make two versions: regular, punica -# strip out stuff from punica + update versions +# Setup/process CUDA arch flags # - -message("CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") -message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - - -# remove gencode flags added by pytorch -#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-gencode") -#list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "arch=compute.*") -#string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) - - +# The torch cmake setup detects and hardcodes the detected architecture flags +# in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported +# architectures and the punica target. So we have to extract and remove all +# the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use +# 'target_compiler_options' for adding '-gencode' arguments so we will use the +# target's CUDA_ARCHITECTURES property instead. This requires repackaging +# the architecture flags into a format that cmake expects for +# CUDA_ARCHITECTURES. # -# Setup arch flags +# This is a bit fragile in that it depends on torch using -gencode as opposed +# to one of the other nvcc options to specify architectures. # +# Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override +# detected architectures. +# +message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + +# Extract all '-gencode' flags from CMAKE_CUDA_FLAGS +string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + +# Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying +# them and passing them back in via the CUDA_ARCHITECTURES property. +string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + +# If this error is triggered, it might mean that torch has changed how it sets +# up nvcc architecture code generation flags. +if (NOT VLLM_CUDA_ARCH_FLAGS) + message(FATAL_ERROR + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. 
(${CMAKE_CUDA_FLAGS})") +endif() -string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) -string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) -message("arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -# filter ARCH_FLAGS and add them back into CMAKE_CUDA_FLAGS - -#set(VLLM_PUNICA_CUDA_ARCH_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) -#list(FILTER VLLM_PUNICA_CUDA_ARCH_FLAGS EXCLUDE REGEX "compute_[1-7][0-9]") - -message("post arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -message("post punica arch flags: ${VLLM_PUNICA_CUDA_ARCH_FLAGS}") - -message("nvcc: ${VLLM_NVCC_FLAGS}") -message("punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") +message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") -#list(APPEND VLLM_NVCC_FLAGS ${VLLM_CUDA_ARCH_FLAGS}) -#list(APPEND VLLM_PUNICA_NVCC_FLAGS ${VLLM_PUNICA_CUDA_ARCH_FLAGS}) +# Macro for converting a 'gencode' version number to a cmake version number. +macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() -# the painful way: NOTE needs to only happen w/CUDA +# Initialize the architecure lists to empty. set(VLLM_CUDA_ARCHES) set(VLLM_PUNICA_CUDA_ARCHES) -macro(string_to_ver VER STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${VER} ${STR}) -endmacro() - -set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") - +# Process each 'gencode' flag. foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - string(REGEX MATCH "arch=compute_\([0-9]+a*\)" COMPUTE ${ARCH}) + # For each flag we want to extract the version number and whether + # it refers to PTX or native code. + # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) if (COMPUTE) set(COMPUTE ${CMAKE_MATCH_1}) -# message("arch: ${COMPUTE}") endif() - string(REGEX MATCH "code=sm_\([0-9]+a*\)" SM ${ARCH}) + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) if (SM) set(SM ${CMAKE_MATCH_1}) -# message("sm: ${SM}") endif() - string(REGEX MATCH "code=compute_\([0-9]+a*\)" CODE ${ARCH}) + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) if (CODE) set(CODE ${CMAKE_MATCH_1}) -# message("code: ${CODE}") endif() - if (COMPUTE AND SM) - string_to_ver(SM_VER ${SM}) - if (NOT SM_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - # TODO: issue warning? - continue() - endif() - list(APPEND VLLM_CUDA_ARCHES ${SM}) - if (SM_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES ${SM}) - endif() + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) else() - string_to_ver(CODE_VER ${CODE}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - # TODO: issue warning? 
- continue() - endif() - list(APPEND VLLM_CUDA_ARCHES "${CODE}-virtual") - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE}-virtual") - endif() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) endif() -endforeach() -message("post nvcc: ${VLLM_NVCC_FLAGS}") -message("post punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") -message("post nvcc arch: ${VLLM_CUDA_ARCHES}") -message("post punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + # Check if the current version is in the supported arch list + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list + list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + + # Add it to punica arch list if the version is >= 8.0 + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + endif() +endforeach() +message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") +message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") # -# Check for existence of CUDA/HIP language support +# Define targets # -# https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html -#include(CheckLanguage) -#check_language(HIP) -#check_language(CUDA) # picked up by torch -# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat them. +# add comment +# Note: optimization level/debug info is set via cmake build type. +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS + MOD_CUDA_ARCHES) -if(HIP_FOUND) - enable_language(HIP) - list(APPEND VLLM_NVCC_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__") + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - # TODO: intersect with this list? - if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + # TODO: needed for rocm? + if (IS_CUDA) + set(CUDA_LANG "CUDA") + else() + set(CUDA_LANG "HIP") endif() - foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - endforeach() -elseif(CUDA_FOUND) - enable_language(CUDA) - set(IS_CUDA true) + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - # TODO: check supported? 
+ set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES + "${MOD_CUDA_ARCHES}") -else() - message(FATAL_ERROR "Can't find CUDA or HIP installation.") -endif() + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_NVCC_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE + csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) +endfunction() # -# Define target source files +# _C extension # set(VLLM_EXT_SRC @@ -243,48 +314,50 @@ if(IS_CUDA) "csrc/custom_all_reduce.cu") endif() -#TODO: list files -File(GLOB VLLM_MOE_EXT_SRC "csrc/moe/*.cu" "csrc/moe/*.cpp") -File(GLOB VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/*.cu" "csrc/punica/*.cpp") +define_module_target(_C + "${VLLM_EXT_SRC}" + "${VLLM_NVCC_FLAGS}" + "${VLLM_CUDA_ARCHES}") # -# Define targets +# _moe_C extension # -set(CMAKE_CXX_STANDARD 17) - -# add comment -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES "${MOD_CUDA_ARCHES}") - - if (IS_CUDA) - set(CUDA_LANG "CUDA") - else() - set(CUDA_LANG "HIP") - endif() +set(VLLM_MOE_EXT_SRC + "csrc/moe/moe_ops.cpp" + "csrc/moe/topk_softmax_kernels.cu") - # Note: optimization level/debug info is set by build type - # target_compile_options(${MOD_NAME} BEFORE PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) - target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) +define_module_target(_moe_C + "${VLLM_MOE_EXT_SRC}" + "${VLLM_NVCC_FLAGS}" + "${VLLM_CUDA_ARCHES}") -# get_target_property(XXX ${MOD_NAME} COMPILE_OPTIONS) -# message("XXX: ${XXX}") -# get_target_property(XXX ${MOD_NAME} COMPILE_FEATURES) -# message("XXX: ${XXX}") - - target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) -endfunction() - -define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" "${VLLM_NVCC_FLAGS}" "${VLLM_CUDA_ARCHES}") -define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") +# +# _punica_C extension +# -#get_cmake_property(_variableNames VARIABLES) -#list (SORT _variableNames) -#foreach (_variableName ${_variableNames}) -# message(STATUS "${_variableName}=${${_variableName}}") -#endforeach() +set(VLLM_PUNICA_EXT_SRC + "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu" + "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu" + 
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu" + "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" + "csrc/punica/punica_ops.cc") + +define_module_target(_punica_C + "${VLLM_PUNICA_EXT_SRC}" + "${VLLM_PUNICA_NVCC_FLAGS}" + "${VLLM_PUNICA_CUDA_ARCHES}") From 1e08118ecd5cef52f60e1c1e2af1ea06169fdd92 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 20:51:09 -0500 Subject: [PATCH 31/76] wip --- CMakeLists.txt | 404 +++++++++++++++++++++++++++++++------------------ Dockerfile | 1 + hipify.py | 111 ++++++++++++++ setup.py | 6 +- 4 files changed, 373 insertions(+), 149 deletions(-) create mode 100755 hipify.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 18aadcd28d12a..666d213036995 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,16 +15,27 @@ set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") # -# Loop thru all supported python versions until we find the first match. +# Loop thru all supported python versions until we find the first suitable +# version that has torch installed. +# # Cmake is unable to pick the lowest supported version when multiple # versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. # foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") - find_package(Python ${VER} EXACT COMPONENTS Interpreter Development.Module) + find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) - message(STATUS "Found python version ${VER} (${Python_EXECUTABLE}).") - break() + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "import torch" + OUTPUT_VARIABLE PYTHON_OUT + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR) + + if(PYTHON_ERROR_CODE EQUAL 0) + message(STATUS "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") + break() + endif() endif() endforeach() @@ -57,6 +68,9 @@ endmacro() # find_package(MPI) +#find_package(HIP) +enable_language(HIP) # use FindHIP? + # # Find where user site-packages and torch are installed and add it to cmake's # search path. @@ -79,7 +93,10 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # Torch also imports CUDA/HIP packages with some customizations, so we do not # need to do this explicitly with check_language/enable_language, etc. # -find_package(Torch 2.1.2 EXACT REQUIRED) +#find_package(Torch 2.1.2 EXACT REQUIRED) +find_package(Torch REQUIRED) + +# TODO: warn about wrong version # For some reason torch does not add libtorch_python.so to the list of torch # libraries to link. Find it by hand using 'append_torchlib_if_found' from @@ -101,159 +118,183 @@ endif() # detect them explicitly with check_language, etc. # if (HIP_FOUND) - list(APPEND VLLM_NVCC_FLAGS - "-DUSE_ROCM" - "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__") + message("HIP FOUND") +# enable_language(HIP) # use FindHIP? # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") + else() + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") endif() + set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") - foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - endforeach() -endif() - -# TODO: IS_CUDA only? - -# Get common NVCC flags from torch. 
-run_python(VLLM_NVCC_FLAGS - "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - -if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") -endif() + # foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) + # list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") + # endforeach() -if(NVCC_THREADS) - list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") -endif() + set(VLLM_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) + set(VLLM_PUNICA_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) -set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + # Get common NVCC flags from torch. + run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" + "Failed to determine torch nvcc compiler flags") -# -# Copy flags+update for punica -# - -list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - -message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") -message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") - -# -# Setup/process CUDA arch flags -# -# The torch cmake setup detects and hardcodes the detected architecture flags -# in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported -# architectures and the punica target. So we have to extract and remove all -# the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use -# 'target_compiler_options' for adding '-gencode' arguments so we will use the -# target's CUDA_ARCHITECTURES property instead. This requires repackaging -# the architecture flags into a format that cmake expects for -# CUDA_ARCHITECTURES. -# -# This is a bit fragile in that it depends on torch using -gencode as opposed -# to one of the other nvcc options to specify architectures. -# -# Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override -# detected architectures. -# -message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - -# Extract all '-gencode' flags from CMAKE_CUDA_FLAGS -string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS - ${CMAKE_CUDA_FLAGS}) - -# Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying -# them and passing them back in via the CUDA_ARCHITECTURES property. -string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS - ${CMAKE_CUDA_FLAGS}) - -# If this error is triggered, it might mean that torch has changed how it sets -# up nvcc architecture code generation flags. -if (NOT VLLM_CUDA_ARCH_FLAGS) - message(FATAL_ERROR - "Could not find any architecture related code generation flags in " - "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") -endif() - -message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - -# Macro for converting a 'gencode' version number to a cmake version number. -macro(string_to_ver OUT_VER IN_STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) -endmacro() + run_python(X + "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") -# Initialize the architecure lists to empty. -set(VLLM_CUDA_ARCHES) -set(VLLM_PUNICA_CUDA_ARCHES) + list(APPEND VLLM_NVCC_FLAGS "${X}") -# Process each 'gencode' flag. -foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag we want to extract the version number and whether - # it refers to PTX or native code. 
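# (worked example of the matching below, not from the patch) running
#   string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM "arch=compute_80,code=sm_80")
# stores the whole match "code=sm_80" in SM and the captured digits "80" in
# CMAKE_MATCH_1, which is then assigned back into SM.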
- # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding - # for that match. + list(APPEND VLLM_NVCC_FLAGS + "-DWITH_HIP" + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__") - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) - endif() + # hack +# set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) +# enable_language(CUDA) # use FindHIP? - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) - endif() +else() + # Get common NVCC flags from torch. + run_python(VLLM_NVCC_FLAGS + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") endif() - # Make sure the virtual architecture can be matched. - if (NOT COMPUTE) - message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") + if(NVCC_THREADS) + list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") endif() - # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) + set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + + # + # Copy flags+update for punica + # + + list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + + message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") + message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + + # + # Setup/process CUDA arch flags + # + # The torch cmake setup detects and hardcodes the detected architecture flags + # in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported + # architectures and the punica target. So we have to extract and remove all + # the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use + # 'target_compiler_options' for adding '-gencode' arguments so we will use the + # target's CUDA_ARCHITECTURES property instead. This requires repackaging + # the architecture flags into a format that cmake expects for + # CUDA_ARCHITECTURES. + # + # This is a bit fragile in that it depends on torch using -gencode as opposed + # to one of the other nvcc options to specify architectures. + # + # Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override + # detected architectures. + # + message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # Extract all '-gencode' flags from CMAKE_CUDA_FLAGS + string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying + # them and passing them back in via the CUDA_ARCHITECTURES property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # If this error is triggered, it might mean that torch has changed how it sets + # up nvcc architecture code generation flags. 
+ if (NOT VLLM_CUDA_ARCH_FLAGS) message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") - endif() - - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) - else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) - endif() - - # Check if the current version is in the supported arch list - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - message(STATUS "discarding unsupported CUDA arch ${VER}.") - continue() + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") endif() - # Add it to the arch list - list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") - - # Add it to punica arch list if the version is >= 8.0 - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") - endif() -endforeach() + message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") + + # Macro for converting a 'gencode' version number to a cmake version number. + macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) + endmacro() + + # Initialize the architecure lists to empty. + set(VLLM_CUDA_ARCHES) + set(VLLM_PUNICA_CUDA_ARCHES) + + # Process each 'gencode' flag. + foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) + # For each flag we want to extract the version number and whether + # it refers to PTX or native code. + # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) + endif() + + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) + else() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) + endif() + + # Check if the current version is in the supported arch list + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list + list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + + # Add it to punica arch list if the version is >= 8.0 + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + endif() + endforeach() -message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") -message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") + message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") +endif() # # Define targets @@ -264,19 +305,37 @@ message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS MOD_CUDA_ARCHES) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - # TODO: needed for rocm? 
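# (note on the intent here, phrased as the usual CMake idiom rather than quoted
# from the patch) CUDA_LANG selects which compile language the extra GPU flags
# are scoped to, typically via a per-language generator expression such as:
#   target_compile_options(${MOD_NAME} PRIVATE
#     $<$<COMPILE_LANGUAGE:CUDA>:${MOD_EXTRA_NVCC_FLAGS}>)
# with HIP builds substituting COMPILE_LANGUAGE:HIP.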
if (IS_CUDA) + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + set(CUDA_LANG "CUDA") + set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES + "${MOD_CUDA_ARCHES}") + else() + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) +# hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) + set(CUDA_LANG "HIP") + foreach(SRC ${MOD_SRC}) + if (${SRC} MATCHES "\.hip$") + message("setting HIP on ${SRC}") + set_source_files_properties(${SRC} PROPERTIES LANGUAGE HIP) + #set_source_files_properties(${SRC} PROPERTIES LANGUAGE CUDA) + #set_source_files_properties(${SRC} PROPERTIES LANGUAGE ${CUDA_LANG}) + endif() + endforeach() + + message("got here! ${MOD_CUDA_ARCHES}") + set_target_properties(${MOD_NAME} PROPERTIES HIP_ARCHITECTURES + "${MOD_CUDA_ARCHES}") + endif() set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES - "${MOD_CUDA_ARCHES}") +# set_target_properties(${MOD_NAME} PROPERTIES LINKER_LANGUAGE CXX) target_compile_options(${MOD_NAME} PRIVATE $<$:${MOD_EXTRA_NVCC_FLAGS}>) @@ -285,9 +344,17 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS "-DTORCH_EXTENSION_NAME=${MOD_NAME}") target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${TORCH_INCLUDE_DIRS} ${MPI_CXX_INCLUDE_DIRS}) + csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + if (TRUE OR IS_CUDA) + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + else() + # -- Python_SOABI=cpython-39-x86_64-linux-gnu + message("got here ${_PYTHON_INCLUDE_DIR}, ${_PYTHON_LIBRARY}") + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES} ${_Python_LIBRARY}) + target_include_directories(${MOD_NAME} PRIVATE ${Python_INCLUDE_DIRS}) + endif() install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() @@ -305,15 +372,56 @@ set(VLLM_EXT_SRC "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu" - "csrc/pybind.cpp") + "csrc/moe_align_block_size_kernels.cu") + +set(VLLM_EXT_ROCM_SRC + "csrc/cache_kernels.hip" + "csrc/attention/attention_kernels.hip" + "csrc/pos_encoding_kernels.hip" + "csrc/activation_kernels.hip" + "csrc/layernorm_kernels.hip" + "csrc/quantization/squeezellm/quant_hip_kernel.hip" + "csrc/quantization/gptq/q_gemm.hip" + "csrc/hip_utils_kernels.hip" + "csrc/moe_align_block_size_kernels.hip") if(IS_CUDA) list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") +elseif(HIP_FOUND) + # maybe use add_custom_target instead + add_dependencies? + message("build dir ${CMAKE_CURRENT_BINARY_DIR}") + + set(X) + foreach (SRC ${VLLM_EXT_SRC}) + list(APPEND X ${SRC}) + endforeach() + + message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") + add_custom_command( + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . 
+ COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} + DEPENDS hipify.py ${VLLM_EXT_SRC} + OUTPUT ${VLLM_EXT_ROCM_SRC}) + +# add_custom_command( +# COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} +# DEPENDS hipify.py ${VLLM_EXT_SRC} +# WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} +# OUTPUT ${VLLM_EXT_ROCM_SRC} +# VERBATIM) + +# set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + set(VLLM_EXT_SRC) + foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) + endforeach() + message("final src: ${VLLM_EXT_SRC}") endif() +list(APPEND VLM_EXT_SRC "csrc/pybind.cpp") #? + define_module_target(_C "${VLLM_EXT_SRC}" "${VLLM_NVCC_FLAGS}" @@ -361,3 +469,9 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") + +get_cmake_property(_variableNames VARIABLES) +list (SORT _variableNames) +foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") +endforeach() diff --git a/Dockerfile b/Dockerfile index c2354ca1f470d..97e629dc07abb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,6 +38,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py +COPY hipify.py hipify.py COPY CMakeLists.txt CMakeLists.txt COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml diff --git a/hipify.py b/hipify.py new file mode 100755 index 0000000000000..93e5c9d78d6a4 --- /dev/null +++ b/hipify.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 + +import argparse +import os + +from torch.utils.hipify.hipify_python import hipify + +if __name__ == '__main__': + print(f"CWD {os.getcwd()}") + + parser = argparse.ArgumentParser() + + parser.add_argument( + "-b", + "--build_dir", + help="The build directory.", + ) + + parser.add_argument( + "-o", + "--output_dir", + help="The output directory.", + ) + + parser.add_argument( + "-i", + "--include_dir", + help="Include directory", + action="append", + default=[], + ) + + parser.add_argument( + "sources", + help="Source files to hipify.", + nargs="*", + default=[] + ) + + args = parser.parse_args() + + print(args.output_dir) + + # limit scope to build_dir only + includes = [os.path.join(args.build_dir, '*')] + print(f"includes {includes}") + + extra_files = [os.path.abspath(s) for s in args.sources] + print(f"extra_files {extra_files}") + + hipify_result = hipify( + project_directory=args.build_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) + + #print(hipify_result) + + hipified_sources = [] + for source in args.sources: + s_abs = os.path.abspath(source) + hipified_s_abs = (hipify_result[s_abs].hipified_path if (s_abs in hipify_result and + hipify_result[s_abs].hipified_path is not None) else s_abs) + if True: + hipified_sources.append(hipified_s_abs) + else: + hipified_sources.append( + os.path.relpath(hipified_s_abs, + os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + + assert(len(hipified_sources) == len(args.sources)) + + # print("\n".join(hipified_sources)) + +# print(f"got here {args.output_dir}") +# os.system(f"find {args.output_dir} -name '*.hip'") +# print("end got here") + +# print(f"got here root") +# os.system(f"find /app/vllm -name '*.hip'") +# print("end got here root") + +# project_directory /app/vllm +# show_detailed True +# 
extensions ('.cu', '.cuh', '.c', '.cc', '.cpp', '.h', '.in', '.hpp') +# header_extensions ('.cuh', '.h', '.hpp') +# output_directory /app/vllm +# header_include_dirs [] +# includes ['/app/vllm/*'] +# extra_files [ +# '/app/vllm/csrc/cache_kernels.cu', +# '/app/vllm/csrc/attention/attention_kernels.cu', +# '/app/vllm/csrc/pos_encoding_kernels.cu', +# '/app/vllm/csrc/activation_kernels.cu', +# '/app/vllm/csrc/layernorm_kernels.cu', +# '/app/vllm/csrc/quantization/squeezellm/quant_cuda_kernel.cu', +# '/app/vllm/csrc/quantization/gptq/q_gemm.cu', +# '/app/vllm/csrc/cuda_utils_kernels.cu', +# '/app/vllm/csrc/moe_align_block_size_kernels.cu', +# '/app/vllm/csrc/pybind.cpp' +# ] +# out_of_place_only False +# ignores () +# show_progress True +# hip_clang_launch False +# is_pytorch_extension True +# hipify_extra_files_only True diff --git a/setup.py b/setup.py index 69fcfa047ef97..79e9f852db76f 100644 --- a/setup.py +++ b/setup.py @@ -69,10 +69,7 @@ def build_extensions(self): # temporary build directory instead '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( cfg.upper(), self.build_temp), - # Hint CMake to use the same Python executable that - # is launching the build, prevents possible mismatching if - # multiple versions of Python are installed - '-DPYTHON_EXECUTABLE={}'.format(sys.executable), + '--log-level=TRACE', ] # TODO: change default to 0 @@ -113,6 +110,7 @@ def build_extensions(self): build_jobs = ['-j', str(num_jobs)] # Config + # TODO: this only needs to happen once subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + cmake_args, cwd=self.build_temp) From 3d042550aa742a6c7ca84b7ef2003fec8eda99b1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:11:45 -0500 Subject: [PATCH 32/76] hacked up rocm support --- CMakeLists.txt | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 666d213036995..18a3355241cd5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,15 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +function (dumpvars MSG) + message(${MSG}) + get_cmake_property(_variableNames VARIABLES) + list (SORT _variableNames) + foreach (_variableName ${_variableNames}) + message(STATUS "${_variableName}=${${_variableName}}") + endforeach() +endfunction() + # # Supported python verions. These versions will be searched in order, the # first match will be selected. @@ -148,10 +157,11 @@ if (HIP_FOUND) list(APPEND VLLM_NVCC_FLAGS "${X}") list(APPEND VLLM_NVCC_FLAGS - "-DWITH_HIP" +# "-DWITH_HIP" "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__") + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") # hack # set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) @@ -317,6 +327,8 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) # hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) + add_dependencies(${MOD_NAME} hipify) + set(CUDA_LANG "HIP") foreach(SRC ${MOD_SRC}) if (${SRC} MATCHES "\.hip$") @@ -390,20 +402,37 @@ if(IS_CUDA) "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") elseif(HIP_FOUND) + dumpvars("VARS") + # maybe use add_custom_target instead + add_dependencies? 
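# (illustrative invocation of the hipify.py helper added above; the paths are
# placeholders)
#   ./hipify.py -b build/temp/csrc -o build/temp/csrc -i build/temp/csrc \
#       csrc/cache_kernels.cu csrc/attention/attention_kernels.cu
# which is expected to write .hip translations of the listed .cu sources under
# the build tree.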
message("build dir ${CMAKE_CURRENT_BINARY_DIR}") set(X) - foreach (SRC ${VLLM_EXT_SRC}) - list(APPEND X ${SRC}) + foreach (SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND X "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() + set(VLLM_EXT_ROCM_SRC ${X}) message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") - add_custom_command( - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} +# add_custom_command( +# COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" +# COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . +# COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} +# DEPENDS hipify.py ${VLLM_EXT_SRC} +# OUTPUT ${VLLM_EXT_ROCM_SRC} +# COMMENT "run hipify") + + message("rocm src: ${VLLM_EXT_ROCM_SRC}") + + add_custom_target( + hipify + COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO2" + COMMAND pwd + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc -i ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} DEPENDS hipify.py ${VLLM_EXT_SRC} - OUTPUT ${VLLM_EXT_ROCM_SRC}) + BYPRODUCTS ${VLLM_EXT_ROCM_SRC} + COMMENT "run hipify2") # add_custom_command( # COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} @@ -412,15 +441,17 @@ elseif(HIP_FOUND) # OUTPUT ${VLLM_EXT_ROCM_SRC} # VERBATIM) -# set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - set(VLLM_EXT_SRC) - foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) - endforeach() + set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + +# set(VLLM_EXT_SRC) +# foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) +# list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) +# endforeach() message("final src: ${VLLM_EXT_SRC}") endif() -list(APPEND VLM_EXT_SRC "csrc/pybind.cpp") #? +list(APPEND VLLM_EXT_SRC "csrc/pybind.cpp") # or leave in original list? 
+#set(VLLM_EXT_SRC "csrc/pybind.cpp") define_module_target(_C "${VLLM_EXT_SRC}" @@ -469,9 +500,3 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_NVCC_FLAGS}" "${VLLM_PUNICA_CUDA_ARCHES}") - -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") -endforeach() From 84bdbc35e7f1805205e572a2a8e959d5e7a8cd7f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:13:32 -0500 Subject: [PATCH 33/76] fix merge issue --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 18a3355241cd5..e07273eca2a58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures -set(ROCM_SUPPORTED_ARCHS "gfx90a;gfx942;gfx1100") +set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # # Loop thru all supported python versions until we find the first suitable From 6ad7e54e80b7166569638638461c607fcea606c0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 Feb 2024 21:22:27 -0500 Subject: [PATCH 34/76] run format.sh --- hipify.py | 40 ++++++++++++++++++++-------------------- setup.py | 1 - 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/hipify.py b/hipify.py index 93e5c9d78d6a4..d7cb1a0ed0ed6 100755 --- a/hipify.py +++ b/hipify.py @@ -30,12 +30,10 @@ default=[], ) - parser.add_argument( - "sources", - help="Source files to hipify.", - nargs="*", - default=[] - ) + parser.add_argument("sources", + help="Source files to hipify.", + nargs="*", + default=[]) args = parser.parse_args() @@ -48,31 +46,33 @@ extra_files = [os.path.abspath(s) for s in args.sources] print(f"extra_files {extra_files}") - hipify_result = hipify( - project_directory=args.build_dir, - output_directory=args.output_dir, - header_include_dirs=[], - includes=includes, - extra_files=extra_files, - show_detailed=True, - is_pytorch_extension=True, - hipify_extra_files_only=True) + hipify_result = hipify(project_directory=args.build_dir, + output_directory=args.output_dir, + header_include_dirs=[], + includes=includes, + extra_files=extra_files, + show_detailed=True, + is_pytorch_extension=True, + hipify_extra_files_only=True) #print(hipify_result) hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) - hipified_s_abs = (hipify_result[s_abs].hipified_path if (s_abs in hipify_result and - hipify_result[s_abs].hipified_path is not None) else s_abs) + hipified_s_abs = (hipify_result[s_abs].hipified_path if + (s_abs in hipify_result + and hipify_result[s_abs].hipified_path is not None) + else s_abs) if True: hipified_sources.append(hipified_s_abs) else: hipified_sources.append( - os.path.relpath(hipified_s_abs, - os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + os.path.relpath( + hipified_s_abs, + os.path.abspath(os.path.join(args.build_dir, os.pardir)))) - assert(len(hipified_sources) == len(args.sources)) + assert (len(hipified_sources) == len(args.sources)) # print("\n".join(hipified_sources)) diff --git a/setup.py b/setup.py index 79e9f852db76f..006897de3812b 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ import os import re import subprocess -import sys from typing import List from packaging.version import parse, Version From 1d50fa7fea622f14b1870ada4e48b63202cdd7d0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 19 
Feb 2024 21:23:30 -0500 Subject: [PATCH 35/76] fix enable_language --- CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e07273eca2a58..17709e09d0a18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,9 +77,6 @@ endmacro() # find_package(MPI) -#find_package(HIP) -enable_language(HIP) # use FindHIP? - # # Find where user site-packages and torch are installed and add it to cmake's # search path. @@ -128,7 +125,7 @@ endif() # if (HIP_FOUND) message("HIP FOUND") -# enable_language(HIP) # use FindHIP? + enable_language(HIP) # use FindHIP? # TODO: intersect with this list? if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) From 10e35263a714260124ad58923881e2e924cbf753 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 00:37:00 -0500 Subject: [PATCH 36/76] cleanups --- CMakeLists.txt | 261 ++++++++++++++++++++++--------------------------- 1 file changed, 117 insertions(+), 144 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 17709e09d0a18..61208af169e84 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,15 +2,6 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -function (dumpvars MSG) - message(${MSG}) - get_cmake_property(_variableNames VARIABLES) - list (SORT _variableNames) - foreach (_variableName ${_variableNames}) - message(STATUS "${_variableName}=${${_variableName}}") - endforeach() -endfunction() - # # Supported python verions. These versions will be searched in order, the # first match will be selected. @@ -99,11 +90,8 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # Torch also imports CUDA/HIP packages with some customizations, so we do not # need to do this explicitly with check_language/enable_language, etc. # -#find_package(Torch 2.1.2 EXACT REQUIRED) find_package(Torch REQUIRED) -# TODO: warn about wrong version - # For some reason torch does not add libtorch_python.so to the list of torch # libraries to link. Find it by hand using 'append_torchlib_if_found' from # torch's cmake setup. @@ -115,83 +103,109 @@ endif() if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) + + # Verify torch version and warn if it is not expected. + if (NOT Torch_VERSION VERSION_EQUAL 2.1.2) + message(WARNING "Pytorch version 2.1.2 expected for CUDA build, " + "saw ${Torch_VERSION} instead.") + endif() +else() + # Verify torch version and warn if it is not expected (derived from Dockerfile.rocm) + # ROCm 5.7 -> torch 2.0.1 + if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL 2.0.1) + message(WARNING "Pytorch version 2.0.1 expected for ROCMm 5.x build, " + "saw ${Torch_VERSION} instead.") + endif() + + # ROCm 6.0 -> torch 2.1.1 + if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND NOT Torch_VERSION VERSION_EQUAL 2.1.1) + message(WARNING "Pytorch version 2.1.1 expected for ROCMm 6.x build, " + "saw ${Torch_VERSION} instead.") + endif() endif() # -# Setup extra NVCC flags +# Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA +# and hip flags for ROCm. # # Note: CUDA + HIP are detected by pytorch package so there's no need to repeat # detect them explicitly with check_language, etc. # if (HIP_FOUND) - message("HIP FOUND") - enable_language(HIP) # use FindHIP? + # Importing torch recognizes and sets up some HIP/ROCm stuff but not all. + # If we want cmake to be able to understand the .hip extension automatically, + # we need to enable HIP explicitly. + enable_language(HIP) - # TODO: intersect with this list? 
- if(NOT DEFINED CMAKE_HIP_ARCHITECTURES) - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") - else() - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") - endif() - set(VLLM_SUPPORTED_HIP_ARCHITECTURES "${ROCM_SUPPORTED_ARCHS}") + # + # VLLM_HIP_ARCHITECUTRES will control the offload-arch flags. + # CMAKE_HIP_ARCHITECTURES is setup by pytorch and can be controlled + # via the PYTORCH_ROCM_ARCH env variable. + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(VLLM_HIP_ARCHITECTURES) + foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS) + list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH}) + endif() + endforeach() - # foreach(HIP_ARCH ${CMAKE_HIP_ARCHITECTURES}) - # list(APPEND VLLM_NVCC_FLAGS "--offload-arch=${HIP_ARCH}") - # endforeach() + if(NOT VLLM_HIP_ARCHITECTURES) + message(FATAL_ERROR + "None of the detected ROCM architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCM architectures are: ${ROCM_SUPPORTED_ARCHS}.") + endif() - set(VLLM_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) - set(VLLM_PUNICA_CUDA_ARCHES ${VLLM_SUPPORTED_HIP_ARCHITECTURES}) + set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) + set(VLLM_PUNICA_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - # Get common NVCC flags from torch. - run_python(VLLM_NVCC_FLAGS + # Get common HIP/HIPCC flags from torch. + run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" "Failed to determine torch nvcc compiler flags") - run_python(X + run_python(VLLM_HIPCC_FLAGS "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND VLLM_NVCC_FLAGS "${X}") + list(APPEND VLLM_GPU_FLAGS ${VLLM_HIPCC_FLAGS}) - list(APPEND VLLM_NVCC_FLAGS -# "-DWITH_HIP" + list(APPEND VLLM_GPU_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") - # hack -# set(CMAKE_CUDA_COMPILER ${hip_HIPCC_EXECUTABLE}) # ${ROCM_PATH}/bin/hipcc) -# enable_language(CUDA) # use FindHIP? - else() # Get common NVCC flags from torch. - run_python(VLLM_NVCC_FLAGS + run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_NVCC_FLAGS "-DENABLE_FP8_E5M2") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_FP8_E5M2") endif() if(NVCC_THREADS) - list(APPEND VLLM_NVCC_FLAGS "--threads=${NVCC_THREADS}") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() - set(VLLM_PUNICA_NVCC_FLAGS ${VLLM_NVCC_FLAGS}) + set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) # # Copy flags+update for punica # - list(REMOVE_ITEM VLLM_PUNICA_NVCC_FLAGS + list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") - message(DEBUG "nvcc: ${VLLM_NVCC_FLAGS}") - message(DEBUG "punica nvcc: ${VLLM_PUNICA_NVCC_FLAGS}") + message(DEBUG "nvcc: ${VLLM_GPU_FLAGS}") + message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") # # Setup/process CUDA arch flags @@ -238,8 +252,8 @@ else() endmacro() # Initialize the architecure lists to empty. - set(VLLM_CUDA_ARCHES) - set(VLLM_PUNICA_CUDA_ARCHES) + set(VLLM_GPU_ARCHES) + set(VLLM_PUNICA_GPU_ARCHES) # Process each 'gencode' flag. 
foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) @@ -291,16 +305,16 @@ else() endif() # Add it to the arch list - list(APPEND VLLM_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") # Add it to punica arch list if the version is >= 8.0 if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_CUDA_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") endif() endforeach() - message(DEBUG "nvcc arch: ${VLLM_CUDA_ARCHES}") - message(DEBUG "punica arch: ${VLLM_PUNICA_CUDA_ARCHES}") + message(DEBUG "nvcc arch: ${VLLM_GPU_ARCHES}") + message(DEBUG "punica arch: ${VLLM_PUNICA_GPU_ARCHES}") endif() # @@ -309,45 +323,30 @@ endif() # add comment # Note: optimization level/debug info is set via cmake build type. -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS - MOD_CUDA_ARCHES) +function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS + MOD_GPU_ARCHES) - # TODO: needed for rocm? - if (IS_CUDA) - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - set(CUDA_LANG "CUDA") - set_target_properties(${MOD_NAME} PROPERTIES CUDA_ARCHITECTURES - "${MOD_CUDA_ARCHES}") + # Note: for ROCm builds we let the proper flags and libraries get + # pulled in by torch. + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + if (IS_CUDA) + set(GPU_LANG "CUDA") else() - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) -# hip_add_library(${MOD_NAME} MODULE ${MOD_SRC}) - + set(GPU_LANG "HIP") + # Make this target dependent on the hipify preprocessor step. + # TODO: consider moving hipify step into here so it could apply to + # any target add_dependencies(${MOD_NAME} hipify) - - set(CUDA_LANG "HIP") - foreach(SRC ${MOD_SRC}) - if (${SRC} MATCHES "\.hip$") - message("setting HIP on ${SRC}") - set_source_files_properties(${SRC} PROPERTIES LANGUAGE HIP) - #set_source_files_properties(${SRC} PROPERTIES LANGUAGE CUDA) - #set_source_files_properties(${SRC} PROPERTIES LANGUAGE ${CUDA_LANG}) - endif() - endforeach() - - message("got here! 
${MOD_CUDA_ARCHES}") - set_target_properties(${MOD_NAME} PROPERTIES HIP_ARCHITECTURES - "${MOD_CUDA_ARCHES}") - endif() - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") -# set_target_properties(${MOD_NAME} PROPERTIES LINKER_LANGUAGE CXX) + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_NVCC_FLAGS}>) + $<$:${MOD_EXTRA_GPU_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") @@ -355,15 +354,7 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_NVCC_FLAGS target_include_directories(${MOD_NAME} PRIVATE csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - - if (TRUE OR IS_CUDA) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - else() - # -- Python_SOABI=cpython-39-x86_64-linux-gnu - message("got here ${_PYTHON_INCLUDE_DIR}, ${_PYTHON_LIBRARY}") - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES} ${_Python_LIBRARY}) - target_include_directories(${MOD_NAME} PRIVATE ${Python_INCLUDE_DIRS}) - endif() + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) endfunction() @@ -383,77 +374,59 @@ set(VLLM_EXT_SRC "csrc/cuda_utils_kernels.cu" "csrc/moe_align_block_size_kernels.cu") -set(VLLM_EXT_ROCM_SRC - "csrc/cache_kernels.hip" - "csrc/attention/attention_kernels.hip" - "csrc/pos_encoding_kernels.hip" - "csrc/activation_kernels.hip" - "csrc/layernorm_kernels.hip" - "csrc/quantization/squeezellm/quant_hip_kernel.hip" - "csrc/quantization/gptq/q_gemm.hip" - "csrc/hip_utils_kernels.hip" - "csrc/moe_align_block_size_kernels.hip") +set(VLLM_EXT_CXX_SRC "csrc/pybind.cpp") if(IS_CUDA) list(APPEND VLLM_EXT_SRC + ${VLLM_EXT_CXX_SRC} "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") -elseif(HIP_FOUND) - dumpvars("VARS") - # maybe use add_custom_target instead + add_dependencies? - message("build dir ${CMAKE_CURRENT_BINARY_DIR}") +elseif(HIP_FOUND) - set(X) - foreach (SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND X "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + # + # Generate ROCM/HIP source file names from CUDA file names. + # + set(VLLM_ROCM_EXT_SRC) + foreach (SRC ${VLLM_EXT_SRC}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND VLLM_EXT_ROCM_SRC "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - set(VLLM_EXT_ROCM_SRC ${X}) + message("rocm src: ${VLLM_EXT_ROCM_SRC}") - message("cmake cwd: ${CMAKE_CURRENT_BINARY_DIR}") -# add_custom_command( -# COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" -# COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . -# COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} -# DEPENDS hipify.py ${VLLM_EXT_SRC} -# OUTPUT ${VLLM_EXT_ROCM_SRC} -# COMMENT "run hipify") + # TODO: this needs a bunch more work - message("rocm src: ${VLLM_EXT_ROCM_SRC}") + # add_custom_command( + # COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" + # COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . 
+ # COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} + # DEPENDS hipify.py ${VLLM_EXT_SRC} + # OUTPUT ${VLLM_EXT_ROCM_SRC} + # COMMENT "run hipify") + # TODO: move copy into python (or figure out how to make hipify work properly) + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify - COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO2" - COMMAND pwd COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc -i ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${VLLM_EXT_SRC} DEPENDS hipify.py ${VLLM_EXT_SRC} BYPRODUCTS ${VLLM_EXT_ROCM_SRC} - COMMENT "run hipify2") - -# add_custom_command( -# COMMAND ./hipify.py -b ${CMAKE_CURRENT_BINARY_DIR}/csrc -i csrc -o ${CMAKE_CURRENT_BINARY_DIR}/csrc ${VLLM_EXT_SRC} -# DEPENDS hipify.py ${VLLM_EXT_SRC} -# WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} -# OUTPUT ${VLLM_EXT_ROCM_SRC} -# VERBATIM) - - set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - -# set(VLLM_EXT_SRC) -# foreach (ROCM_SRC ${VLLM_EXT_ROCM_SRC}) -# list(APPEND VLLM_EXT_SRC ${CMAKE_SOURCE_DIR}/${ROCM_SRC}) -# endforeach() - message("final src: ${VLLM_EXT_SRC}") -endif() + COMMENT "Running hipify on extension source files.") + + # Swap out original extension sources with hipified sources. + set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) + list(APPEND VLLM_EXT_SRC ${VLLM_EXT_CXX_SRC}) -list(APPEND VLLM_EXT_SRC "csrc/pybind.cpp") # or leave in original list? -#set(VLLM_EXT_SRC "csrc/pybind.cpp") + message(DEBUG "final ext src: ${VLLM_EXT_SRC}") + +endif() define_module_target(_C "${VLLM_EXT_SRC}" - "${VLLM_NVCC_FLAGS}" - "${VLLM_CUDA_ARCHES}") + "${VLLM_GPU_FLAGS}" + "${VLLM_GPU_ARCHES}") # # _moe_C extension @@ -465,8 +438,8 @@ set(VLLM_MOE_EXT_SRC define_module_target(_moe_C "${VLLM_MOE_EXT_SRC}" - "${VLLM_NVCC_FLAGS}" - "${VLLM_CUDA_ARCHES}") + "${VLLM_GPU_FLAGS}" + "${VLLM_GPU_ARCHES}") # # _punica_C extension @@ -495,5 +468,5 @@ set(VLLM_PUNICA_EXT_SRC define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" - "${VLLM_PUNICA_NVCC_FLAGS}" - "${VLLM_PUNICA_CUDA_ARCHES}") + "${VLLM_PUNICA_GPU_FLAGS}" + "${VLLM_PUNICA_GPU_ARCHES}") From b56afb1315ad3929e75aca45eddeb8a6380160f6 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:24:08 -0500 Subject: [PATCH 37/76] more hipify cleanups --- CMakeLists.txt | 92 +++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 61208af169e84..cc5b1b303941d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,11 +321,57 @@ endif() # Define targets # +# add comment +macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + message(DEBUG "sources: ${SRCS}") + message(DEBUG "cxx sources: ${CXX_SRCS}") + + # + # Generate ROCM/HIP source file names from CUDA file names. 
+ # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + message("rocm src: ${HIP_SRCS}") + + # TODO: hipify script needs a bunch more work + # TODO: move copy into python (or figure out how to make hipify work properly) + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) + + message(DEBUG "final ext src: ${OUT_SRCS}") +endmacro() + # add comment # Note: optimization level/debug info is set via cmake build type. function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + if (NOT IS_CUDA AND HIP_FOUND) + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + # Note: for ROCm builds we let the proper flags and libraries get # pulled in by torch. Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) @@ -334,10 +380,9 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS set(GPU_LANG "CUDA") else() set(GPU_LANG "HIP") + # Make this target dependent on the hipify preprocessor step. - # TODO: consider moving hipify step into here so it could apply to - # any target - add_dependencies(${MOD_NAME} hipify) + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) endif() set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES @@ -372,54 +417,17 @@ set(VLLM_EXT_SRC "csrc/quantization/squeezellm/quant_cuda_kernel.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu") - -set(VLLM_EXT_CXX_SRC "csrc/pybind.cpp") + "csrc/moe_align_block_size_kernels.cu" + "csrc/pybind.cpp") if(IS_CUDA) list(APPEND VLLM_EXT_SRC - ${VLLM_EXT_CXX_SRC} "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") elseif(HIP_FOUND) - # - # Generate ROCM/HIP source file names from CUDA file names. - # - set(VLLM_ROCM_EXT_SRC) - foreach (SRC ${VLLM_EXT_SRC}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND VLLM_EXT_ROCM_SRC "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - message("rocm src: ${VLLM_EXT_ROCM_SRC}") - - # TODO: this needs a bunch more work - - # add_custom_command( - # COMMAND echo "HELLLLLLLLLLLLLLLLLLLLLLLLLLO" - # COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc . - # COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b csrc -o csrc -i csrc ${VLLM_EXT_SRC} - # DEPENDS hipify.py ${VLLM_EXT_SRC} - # OUTPUT ${VLLM_EXT_ROCM_SRC} - # COMMENT "run hipify") - - # TODO: move copy into python (or figure out how to make hipify work properly) - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${VLLM_EXT_SRC} - DEPENDS hipify.py ${VLLM_EXT_SRC} - BYPRODUCTS ${VLLM_EXT_ROCM_SRC} - COMMENT "Running hipify on extension source files.") - - # Swap out original extension sources with hipified sources. 
- set(VLLM_EXT_SRC ${VLLM_EXT_ROCM_SRC}) - list(APPEND VLLM_EXT_SRC ${VLLM_EXT_CXX_SRC}) - message(DEBUG "final ext src: ${VLLM_EXT_SRC}") endif() From b3f3b554c78c4a0d6d69bf242a5c2b68c9212ad8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:29:47 -0500 Subject: [PATCH 38/76] comment --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index cc5b1b303941d..ef4a783388d87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -369,6 +369,7 @@ function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) if (NOT IS_CUDA AND HIP_FOUND) + # Add hipify preprocessing step if we are running on AMD. hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() From 7f0c908e1be07e1714bf1a6c3efd03b4ee7163db Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 01:35:29 -0500 Subject: [PATCH 39/76] fix ruff --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef4a783388d87..3a92952d02bb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) # -# Supported python verions. These versions will be searched in order, the +# Supported python versions. These versions will be searched in order, the # first match will be selected. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") @@ -251,7 +251,7 @@ else() string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) endmacro() - # Initialize the architecure lists to empty. + # Initialize the architecture lists to empty. set(VLLM_GPU_ARCHES) set(VLLM_PUNICA_GPU_ARCHES) From a8a8bd8befd46fb7f565cebf7efe0a643a49db3a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 12:18:56 -0500 Subject: [PATCH 40/76] code cleanups --- CMakeLists.txt | 14 ++++---------- hipify.py | 50 +++++++------------------------------------------- 2 files changed, 11 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a92952d02bb9..3be55b302c2e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) + # Note: find_package(Torch) won't work here because cmake might not + # have the proper search path set yet. 
execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "import torch" @@ -343,15 +345,12 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - message("rocm src: ${HIP_SRCS}") + message(DEBUG "hip src: ${HIP_SRCS}") - # TODO: hipify script needs a bunch more work - # TODO: move copy into python (or figure out how to make hipify work properly) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND cp -r ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CSRC_BUILD_DIR} -o ${CSRC_BUILD_DIR} -i ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} -i ${CMAKE_SOURCE_DIR}/csrc ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on extension source files.") @@ -425,11 +424,6 @@ if(IS_CUDA) list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") - -elseif(HIP_FOUND) - - - endif() define_module_target(_C diff --git a/hipify.py b/hipify.py index d7cb1a0ed0ed6..282a9062e3836 100755 --- a/hipify.py +++ b/hipify.py @@ -1,13 +1,12 @@ #!/usr/bin/env python3 import argparse +import shutil import os from torch.utils.hipify.hipify_python import hipify if __name__ == '__main__': - print(f"CWD {os.getcwd()}") - parser = argparse.ArgumentParser() parser.add_argument( @@ -37,14 +36,14 @@ args = parser.parse_args() - print(args.output_dir) - # limit scope to build_dir only includes = [os.path.join(args.build_dir, '*')] - print(f"includes {includes}") extra_files = [os.path.abspath(s) for s in args.sources] - print(f"extra_files {extra_files}") + + # Copy sources from project directory to output directory. + # The directory might already exist to hold object files so we ignore that. + shutil.copytree(args.build_dir, args.output_dir, dirs_exist_ok=True) hipify_result = hipify(project_directory=args.build_dir, output_directory=args.output_dir, @@ -55,8 +54,6 @@ is_pytorch_extension=True, hipify_extra_files_only=True) - #print(hipify_result) - hipified_sources = [] for source in args.sources: s_abs = os.path.abspath(source) @@ -74,38 +71,5 @@ assert (len(hipified_sources) == len(args.sources)) - # print("\n".join(hipified_sources)) - -# print(f"got here {args.output_dir}") -# os.system(f"find {args.output_dir} -name '*.hip'") -# print("end got here") - -# print(f"got here root") -# os.system(f"find /app/vllm -name '*.hip'") -# print("end got here root") - -# project_directory /app/vllm -# show_detailed True -# extensions ('.cu', '.cuh', '.c', '.cc', '.cpp', '.h', '.in', '.hpp') -# header_extensions ('.cuh', '.h', '.hpp') -# output_directory /app/vllm -# header_include_dirs [] -# includes ['/app/vllm/*'] -# extra_files [ -# '/app/vllm/csrc/cache_kernels.cu', -# '/app/vllm/csrc/attention/attention_kernels.cu', -# '/app/vllm/csrc/pos_encoding_kernels.cu', -# '/app/vllm/csrc/activation_kernels.cu', -# '/app/vllm/csrc/layernorm_kernels.cu', -# '/app/vllm/csrc/quantization/squeezellm/quant_cuda_kernel.cu', -# '/app/vllm/csrc/quantization/gptq/q_gemm.cu', -# '/app/vllm/csrc/cuda_utils_kernels.cu', -# '/app/vllm/csrc/moe_align_block_size_kernels.cu', -# '/app/vllm/csrc/pybind.cpp' -# ] -# out_of_place_only False -# ignores () -# show_progress True -# hip_clang_launch False -# is_pytorch_extension True -# hipify_extra_files_only True + # Print hipified source files. 
+ print("\n".join(hipified_sources)) From d9ed8b93daa5b4fa3c322faeda67390f63f69512 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 13:15:24 -0500 Subject: [PATCH 41/76] add comments --- CMakeLists.txt | 69 ++++++++++++++++++++++++++++++++------------------ hipify.py | 42 +++++++++++++++--------------- 2 files changed, 65 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3be55b302c2e2..b084768041ac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(vllm_extensions LANGUAGES CXX) # # Supported python versions. These versions will be searched in order, the -# first match will be selected. +# first match is be selected. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") @@ -25,6 +25,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) + # Attempt to import torch from python we just found. If so, stop + # searching for other versions of python. # Note: find_package(Torch) won't work here because cmake might not # have the proper search path set yet. execute_process( @@ -66,7 +68,7 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find MPI package +# Try to find the MPI package # find_package(MPI) @@ -103,6 +105,9 @@ if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() +# +# Check the torch version and warn if it isn't what is expected. +# if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) @@ -134,15 +139,18 @@ endif() # detect them explicitly with check_language, etc. # if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm stuff but not all. - # If we want cmake to be able to understand the .hip extension automatically, - # we need to enable HIP explicitly. + # Importing torch recognizes and sets up some HIP/ROCm configuration but + # does not let cmake recognize .hip files. If we want cmake to be able to + # understand the .hip extension automatically, we need to enable HIP + # explicitly. enable_language(HIP) # - # VLLM_HIP_ARCHITECUTRES will control the offload-arch flags. - # CMAKE_HIP_ARCHITECTURES is setup by pytorch and can be controlled + # VLLM_HIP_ARCHITECUTRES controls the --offload-arch flags. + # CMAKE_HIP_ARCHITECTURES is set up by pytorch and can be controlled # via the PYTORCH_ROCM_ARCH env variable. + # + # # Find the intersection of the supported + detected architectures to # set the module architecture flags. @@ -172,16 +180,17 @@ if (HIP_FOUND) "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND VLLM_GPU_FLAGS ${VLLM_HIPCC_FLAGS}) - list(APPEND VLLM_GPU_FLAGS + ${VLLM_HIPCC_FLAGS} "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") else() + # # Get common NVCC flags from torch. 
+ # run_python(VLLM_GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") @@ -199,7 +208,6 @@ else() # # Copy flags+update for punica # - list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS "-D__CUDA_NO_HALF_OPERATORS__" "-D__CUDA_NO_HALF_CONVERSIONS__" @@ -213,13 +221,13 @@ else() # Setup/process CUDA arch flags # # The torch cmake setup detects and hardcodes the detected architecture flags - # in CMAKE_CUDA_FLAGS but we will need to filter/modify them for the supported - # architectures and the punica target. So we have to extract and remove all - # the '-gencode' flags from CMAKE_CUDA_FLAGS for processing. We can't use - # 'target_compiler_options' for adding '-gencode' arguments so we will use the - # target's CUDA_ARCHITECTURES property instead. This requires repackaging - # the architecture flags into a format that cmake expects for - # CUDA_ARCHITECTURES. + # in CMAKE_CUDA_FLAGS. Since CMAKE_CUDA_FLAGS is a "global" variable, we can't + # modify it on a per-target basis, i.e. for the punica extension. + # So we have to extract and remove all the '-gencode' flags from + # CMAKE_CUDA_FLAGS for processing. We can't use 'target_compiler_options' + # for adding '-gencode' arguments so we use the target's CUDA_ARCHITECTURES + # property instead. This requires repackaging the architecture flags into a + # format that cmake expects for CUDA_ARCHITECTURES. # # This is a bit fragile in that it depends on torch using -gencode as opposed # to one of the other nvcc options to specify architectures. @@ -261,7 +269,7 @@ else() foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) # For each flag we want to extract the version number and whether # it refers to PTX or native code. - # Note: if a regex matches then CMAKE_MATCH_1 will hold the binding + # Note: if a regex matches then CMAKE_MATCH_1 holds the binding # for that match. string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) @@ -323,7 +331,11 @@ endif() # Define targets # -# add comment +# +# Add a target named `NAME` that runs the hipify preprocessor on a set of +# CUDA source files. The names of the corresponding "hipified" sources +# are stored in `OUT_SRCS`. +# macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # # Split into C++ and non-C++ (i.e. CUDA) sources @@ -350,7 +362,7 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -b ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} -i ${CMAKE_SOURCE_DIR}/csrc ${SRCS} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on extension source files.") @@ -362,18 +374,27 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) message(DEBUG "final ext src: ${OUT_SRCS}") endmacro() -# add comment +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to cmake documentation on CMAKE_CUDA_ARCHITECTURES +# and CMAKE_HIP_ARCHITECTURES for more info. +# # Note: optimization level/debug info is set via cmake build type. 
+# function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + # Add hipify preprocessing step if we are building with HIP/ROCm. if (NOT IS_CUDA AND HIP_FOUND) - # Add hipify preprocessing step if we are running on AMD. hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() - # Note: for ROCm builds we let the proper flags and libraries get - # pulled in by torch. Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) if (IS_CUDA) diff --git a/hipify.py b/hipify.py index 282a9062e3836..c4d8450630ba3 100755 --- a/hipify.py +++ b/hipify.py @@ -1,5 +1,13 @@ #!/usr/bin/env python3 +# +# A command line tool for running pytorch's hipify preprocessor on CUDA +# source files. +# +# See https://github.com/ROCm/hipify_torch +# and /utils/hipify/hipify_python.py +# + import argparse import shutil import os @@ -9,26 +17,21 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() + # Project directory where all the source + include files live. parser.add_argument( - "-b", - "--build_dir", - help="The build directory.", + "-p", + "--project_dir", + help="The project directory.", ) + # Directory where hipified files are written. parser.add_argument( "-o", "--output_dir", help="The output directory.", ) - parser.add_argument( - "-i", - "--include_dir", - help="Include directory", - action="append", - default=[], - ) - + # Source files to convert. parser.add_argument("sources", help="Source files to hipify.", nargs="*", @@ -36,16 +39,17 @@ args = parser.parse_args() - # limit scope to build_dir only - includes = [os.path.join(args.build_dir, '*')] + # Limit include scope to project_dir only + includes = [os.path.join(args.project_dir, '*')] + # Get absolute path for all source files. extra_files = [os.path.abspath(s) for s in args.sources] # Copy sources from project directory to output directory. # The directory might already exist to hold object files so we ignore that. 
- shutil.copytree(args.build_dir, args.output_dir, dirs_exist_ok=True) + shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) - hipify_result = hipify(project_directory=args.build_dir, + hipify_result = hipify(project_directory=args.project_dir, output_directory=args.output_dir, header_include_dirs=[], includes=includes, @@ -61,13 +65,7 @@ (s_abs in hipify_result and hipify_result[s_abs].hipified_path is not None) else s_abs) - if True: - hipified_sources.append(hipified_s_abs) - else: - hipified_sources.append( - os.path.relpath( - hipified_s_abs, - os.path.abspath(os.path.join(args.build_dir, os.pardir)))) + hipified_sources.append(hipified_s_abs) assert (len(hipified_sources) == len(args.sources)) From 55b73e9d317feeac6406886499f3348a863c797f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 13:17:17 -0500 Subject: [PATCH 42/76] tweaks --- setup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 006897de3812b..ac2990e90654a 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def is_ccache_available() -> bool: - return which("ccacheX") is not None + return which("ccache") is not None def is_ninja_available() -> bool: @@ -71,8 +71,7 @@ def build_extensions(self): '--log-level=TRACE', ] - # TODO: change default to 0 - verbose = bool(int(os.getenv('VERBOSE', '1'))) + verbose = bool(int(os.getenv('VERBOSE', '0'))) if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] From 298fbf242272f8bc5e113b2866c528bf5cbecf11 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 14:39:33 -0500 Subject: [PATCH 43/76] restore accidentally removed comment --- requirements-build.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-build.txt b/requirements-build.txt index 8975f477fe96c..a8efcde590bbf 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,3 +1,4 @@ +# Should be mirrored in pyproject.toml cmake>=3.21 ninja packaging From 9bb0aebf5f8dd805f858c2aa66c38a477aee6059 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 22 Feb 2024 23:50:41 -0500 Subject: [PATCH 44/76] remove cmake logging --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index ac2990e90654a..1d0a133b8ce69 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,6 @@ def build_extensions(self): # temporary build directory instead '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( cfg.upper(), self.build_temp), - '--log-level=TRACE', ] verbose = bool(int(os.getenv('VERBOSE', '0'))) From 17349a58a465fcc849e49a701422d0d2760434e8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 23 Feb 2024 00:43:10 -0500 Subject: [PATCH 45/76] add 'supported' target --- CMakeLists.txt | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b084768041ac3..501069e9131cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -494,3 +494,47 @@ define_module_target(_punica_C "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" "${VLLM_PUNICA_GPU_ARCHES}") + +# +# Add 'supported' target which detects which extensions should be +# built based on platform/architecture. This is the same logic that +# setup.py uses to select which extensions should be built. +# +# The 'supported' target makes direct use of cmake easier since knowledge +# of which extensions are supported have been factored in, e.g. +# +# cmake --build . 
--target supported +# +add_custom_target(supported) + +if (IS_CUDA OR HIP_FOUND) + message(STATUS "Enabling C extension.") + add_dependencies(supported _C) +endif() + +if (IS_CUDA) + message(STATUS "Enabling moe extension.") + add_dependencies(supported _moe_C) + + set(ENABLE_PUNICA) + # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or + # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. + if (DEFINED VLLM_INSTALL_PUNICA_KERNELS OR ENV{VLLM_INSTALL_PUNICA_KERNELS}) + set(ENABLE_PUNICA true) + foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) + string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) + string_to_ver(ARCH_VER ${ARCH_VER_STR}) + if (ARCH_VER VERSION_LESS 8.0) + message(STATUS + "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") + set(ENABLE_PUNICA false) + break() + endif() + endforeach() + endif() + + if (ENABLE_PUNICA) + message(STATUS "Enabling punica extension.") + add_dependencies(supported _punica_C) + endif() +endif() From 6fa22b51066a784a5736b48b24c865b77a077942 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Sat, 24 Feb 2024 00:15:06 -0500 Subject: [PATCH 46/76] cleanup comments, add variables for supported torch versions --- CMakeLists.txt | 193 ++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 84 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 501069e9131cb..e8d8d573c5d04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,29 +4,43 @@ project(vllm_extensions LANGUAGES CXX) # # Supported python versions. These versions will be searched in order, the -# first match is be selected. +# first match will be selected. These should be kept in sync with setup.py. # set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") -# Supported NVIDIA architectures +# Supported NVIDIA architectures. set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") -# Supported AMD GPU architectures +# Supported AMD GPU architectures. set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # -# Loop thru all supported python versions until we find the first suitable -# version that has torch installed. +# Supported/expected torch versions for CUDA/ROCm. # -# Cmake is unable to pick the lowest supported version when multiple -# versions are available, even with CMAKE_FIND_PACKAGE_SORT_ORDER. +# Currently, having an incorrect pytorch version results in a warning +# rather than an error. +# +# Note: the CUDA torch version is derived from pyproject.toml and various +# requirements.txt files and should be kept consistent. The ROCm torch +# versions are derived from Dockerfile.rocm +# +set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2") +set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") +set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") + +# +# Loop through all supported python versions until the first suitable version +# that has torch installed. +# +# Note: cmake is unable to pick the lowest supported version when multiple +# versions are available, even with `CMAKE_FIND_PACKAGE_SORT_ORDER`. # foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) message(STATUS "Attempting to find python ${VER} package.") find_package(Python ${VER} COMPONENTS Interpreter Development.Module) if (Python_FOUND) - # Attempt to import torch from python we just found. If so, stop - # searching for other versions of python. + # Attempt to import torch from python was just found. If torch can + # be imported, stop searching for other versions of python. # Note: find_package(Torch) won't work here because cmake might not # have the proper search path set yet. 
execute_process( @@ -37,7 +51,8 @@ foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) ERROR_VARIABLE PYTHON_STDERR) if(PYTHON_ERROR_CODE EQUAL 0) - message(STATUS "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") + message(STATUS + "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") break() endif() endif() @@ -45,13 +60,14 @@ endforeach() if (NOT Python_FOUND) message(FATAL_ERROR - "No supported version of python found. ('${PYTHON_SUPPORTED_VERSIONS}')") + "No supported version of python (with pytorch) found. " + "('${PYTHON_SUPPORTED_VERSIONS}')") endif() # -# Run EXPR in python. The standard output of python is stored in OUT and has -# trailing whitespace stripped. If an error is encountered when running python, -# a fatal message ERR_MSG is issued. +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. # macro (run_python OUT EXPR ERR_MSG) execute_process( @@ -68,36 +84,38 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find the MPI package +# Try to find the MPI package. # find_package(MPI) # -# Find where user site-packages and torch are installed and add it to cmake's -# search path. +# Update cmake's `CMAKE_PREFIX_PATH` with probably torch locations. # -# Run EXPR in python after importing PKG. Use the result of this to extend -# CMAKE_PREFIX_PATH so we can import the torch cmake configuration. +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. macro (append_cmake_prefix_path PKG EXPR) run_python(PREFIX_PATH "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) endmacro() -# Add user site-packages and torch path to CMAKE_PREFIX_PATH +# Add user site-packages path to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("site" "site.getusersitepackages()") + +# Query torch for its install path and add to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # # Import torch cmake configuration. -# Torch also imports CUDA/HIP packages with some customizations, so we do not -# need to do this explicitly with check_language/enable_language, etc. +# Torch also imports CUDA (and partially HIP) languages with some customizations, +# so there is no need to do this explicitly with check_language/enable_language, +# etc. # find_package(Torch REQUIRED) # For some reason torch does not add libtorch_python.so to the list of torch -# libraries to link. Find it by hand using 'append_torchlib_if_found' from +# libraries to link. Find it by hand using `append_torchlib_if_found` from # torch's cmake setup. append_torchlib_if_found(torch_python) @@ -111,23 +129,23 @@ endif() if (NOT HIP_FOUND AND CUDA_FOUND) set(IS_CUDA true) - # Verify torch version and warn if it is not expected. 
- if (NOT Torch_VERSION VERSION_EQUAL 2.1.2) - message(WARNING "Pytorch version 2.1.2 expected for CUDA build, " - "saw ${Torch_VERSION} instead.") + if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} " + "expected for CUDA build, saw ${Torch_VERSION} instead.") endif() else() - # Verify torch version and warn if it is not expected (derived from Dockerfile.rocm) - # ROCm 5.7 -> torch 2.0.1 - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL 2.0.1) - message(WARNING "Pytorch version 2.0.1 expected for ROCMm 5.x build, " - "saw ${Torch_VERSION} instead.") + # ROCm 5.x + if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " + "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") endif() - # ROCm 6.0 -> torch 2.1.1 - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND NOT Torch_VERSION VERSION_EQUAL 2.1.1) - message(WARNING "Pytorch version 2.1.1 expected for ROCMm 6.x build, " - "saw ${Torch_VERSION} instead.") + # ROCm 6.x + if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " + "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") endif() endif() @@ -135,20 +153,19 @@ endif() # Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA # and hip flags for ROCm. # -# Note: CUDA + HIP are detected by pytorch package so there's no need to repeat -# detect them explicitly with check_language, etc. +# Note: CUDA (and partially HIP) is detected by pytorch package so there's no +# need to repeat detecting it explicitly with check_language, etc. # if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm configuration but - # does not let cmake recognize .hip files. If we want cmake to be able to - # understand the .hip extension automatically, we need to enable HIP - # explicitly. + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. enable_language(HIP) # - # VLLM_HIP_ARCHITECUTRES controls the --offload-arch flags. - # CMAKE_HIP_ARCHITECTURES is set up by pytorch and can be controlled - # via the PYTORCH_ROCM_ARCH env variable. + # `VLLM_HIP_ARCHITECUTRES` controls the `--offload-arch` flags. + # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled + # via the `PYTORCH_ROCM_ARCH` env variable. # # @@ -164,8 +181,8 @@ if (HIP_FOUND) if(NOT VLLM_HIP_ARCHITECTURES) message(FATAL_ERROR - "None of the detected ROCM architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCM architectures are: ${ROCM_SUPPORTED_ARCHS}.") + "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${ROCM_SUPPORTED_ARCHS}.") endif() set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) @@ -218,31 +235,32 @@ else() message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") # - # Setup/process CUDA arch flags + # Setup/process CUDA arch flags. # - # The torch cmake setup detects and hardcodes the detected architecture flags - # in CMAKE_CUDA_FLAGS. Since CMAKE_CUDA_FLAGS is a "global" variable, we can't - # modify it on a per-target basis, i.e. for the punica extension. 
- # So we have to extract and remove all the '-gencode' flags from - # CMAKE_CUDA_FLAGS for processing. We can't use 'target_compiler_options' - # for adding '-gencode' arguments so we use the target's CUDA_ARCHITECTURES - # property instead. This requires repackaging the architecture flags into a - # format that cmake expects for CUDA_ARCHITECTURES. + # The torch cmake setup hardcodes the detected architecture flags in + # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it + # can't modified on a per-target basis, e.g. for the `punica` extension. + # So, all the `-gencode` flags need to be extracted and removed from + # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. + # Since it's not possible to use `target_compiler_options` for adding target + # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property + # must be used instead. This requires repackaging the architecture flags + # into a format that cmake expects for `CUDA_ARCHITECTURES`. # - # This is a bit fragile in that it depends on torch using -gencode as opposed + # This is a bit fragile in that it depends on torch using `-gencode` as opposed # to one of the other nvcc options to specify architectures. # - # Note: torch uses the TORCH_CUDA_ARCH_LIST environment variable to override + # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override # detected architectures. # message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - # Extract all '-gencode' flags from CMAKE_CUDA_FLAGS + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS ${CMAKE_CUDA_FLAGS}) - # Remove all '-gencode' flags from CMAKE_CUDA_FLAGS since we will be modifying - # them and passing them back in via the CUDA_ARCHITECTURES property. + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) @@ -256,7 +274,7 @@ else() message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - # Macro for converting a 'gencode' version number to a cmake version number. + # Macro for converting a `gencode` version number to a cmake version number. macro(string_to_ver OUT_VER IN_STR) string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) endmacro() @@ -265,11 +283,11 @@ else() set(VLLM_GPU_ARCHES) set(VLLM_PUNICA_GPU_ARCHES) - # Process each 'gencode' flag. + # Process each `gencode` flag. foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag we want to extract the version number and whether - # it refers to PTX or native code. - # Note: if a regex matches then CMAKE_MATCH_1 holds the binding + # For each flag, extract the version number and whether it refers to PTX + # or native code. + # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding # for that match. string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) @@ -307,17 +325,17 @@ else() set(CODE_ARCH ${CODE}) endif() - # Check if the current version is in the supported arch list + # Check if the current version is in the supported arch list. string_to_ver(CODE_VER ${CODE_ARCH}) if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) message(STATUS "discarding unsupported CUDA arch ${VER}.") continue() endif() - # Add it to the arch list + # Add it to the arch list. 
list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") - # Add it to punica arch list if the version is >= 8.0 + # Add it to punica arch list if the version is >= 8.0. if (CODE_VER GREATER_EQUAL 8.0) list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") endif() @@ -328,17 +346,17 @@ else() endif() # -# Define targets +# Define extension targets # # -# Add a target named `NAME` that runs the hipify preprocessor on a set of -# CUDA source files. The names of the corresponding "hipified" sources -# are stored in `OUT_SRCS`. +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. # macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # - # Split into C++ and non-C++ (i.e. CUDA) sources + # Split into C++ and non-C++ (i.e. CUDA) sources. # set(SRCS ${ORIG_SRCS}) set(CXX_SRCS ${ORIG_SRCS}) @@ -349,7 +367,9 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) message(DEBUG "cxx sources: ${CXX_SRCS}") # - # Generate ROCM/HIP source file names from CUDA file names. + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. # set(HIP_SRCS) foreach (SRC ${SRCS}) @@ -365,7 +385,7 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on extension source files.") + COMMENT "Running hipify on ${NAME} extension source files.") # Swap out original extension sources with hipified sources. set(${OUT_SRCS} ${HIP_SRCS}) @@ -382,15 +402,17 @@ endmacro() # directory. # MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. # MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to cmake documentation on CMAKE_CUDA_ARCHITECTURES -# and CMAKE_HIP_ARCHITECTURES for more info. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. # # Note: optimization level/debug info is set via cmake build type. # +# TODO: consider passing the language (CUDA/HIP/etc.) as an argument. +# function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) - # Add hipify preprocessing step if we are building with HIP/ROCm. + # Add hipify preprocessing step when building with HIP/ROCm. if (NOT IS_CUDA AND HIP_FOUND) hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() @@ -496,13 +518,16 @@ define_module_target(_punica_C "${VLLM_PUNICA_GPU_ARCHES}") # -# Add 'supported' target which detects which extensions should be -# built based on platform/architecture. This is the same logic that -# setup.py uses to select which extensions should be built. +# Add the `supported` target which detects which extensions should be +# built based on platform/architecture. This is the same logic that +# setup.py uses to select which extensions should be built and should +# be kept in sync. # -# The 'supported' target makes direct use of cmake easier since knowledge -# of which extensions are supported have been factored in, e.g. +# The `supported` target makes direct use of cmake easier since knowledge +# of which extensions are supported has been factored in, e.g. # +# mkdir build && cd build +# cmake -G Ninja .. # cmake --build . 
--target supported # add_custom_target(supported) From ed3f191a8b9541212e89e0b3ab36f2f97d2d23cb Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 17:56:40 +0000 Subject: [PATCH 47/76] replace IS_CUDA with VLLM_GPU_LANG --- CMakeLists.txt | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e8d8d573c5d04..c99bccd15d296 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,21 +119,20 @@ find_package(Torch REQUIRED) # torch's cmake setup. append_torchlib_if_found(torch_python) -if ((NOT HIP_FOUND) AND (NOT CUDA_FOUND)) - message(FATAL_ERROR "Can't find CUDA or HIP installation.") -endif() - # -# Check the torch version and warn if it isn't what is expected. +# Set up GPU language and check the torch version and warn if it isn't +# what is expected. # if (NOT HIP_FOUND AND CUDA_FOUND) - set(IS_CUDA true) + set(VLLM_GPU_LANG "CUDA") if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA}) message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} " "expected for CUDA build, saw ${Torch_VERSION} instead.") endif() -else() +elseif(HIP_FOUND) + set(VLLM_GPU_LANG "HIP") + # ROCm 5.x if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) @@ -147,6 +146,8 @@ else() message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") endif() +else() + message(FATAL_ERROR "Can't find CUDA or HIP installation.") endif() # @@ -407,34 +408,28 @@ endmacro() # # Note: optimization level/debug info is set via cmake build type. # -# TODO: consider passing the language (CUDA/HIP/etc.) as an argument. -# -function(define_module_target MOD_NAME MOD_SRC MOD_EXTRA_GPU_FLAGS +function(define_module_target MOD_NAME MOD_GPU_LANG MOD_SRC MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) # Add hipify preprocessing step when building with HIP/ROCm. - if (NOT IS_CUDA AND HIP_FOUND) + if (MOD_GPU_LANG STREQUAL "HIP") hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") endif() Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - if (IS_CUDA) - set(GPU_LANG "CUDA") - else() - set(GPU_LANG "HIP") - + if (MOD_GPU_LANG STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. 
add_dependencies(${MOD_NAME} hipify${MOD_NAME}) endif() - set_target_properties(${MOD_NAME} PROPERTIES ${GPU_LANG}_ARCHITECTURES + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES "${MOD_GPU_ARCHES}") set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) + $<$:${MOD_EXTRA_GPU_FLAGS}>) target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") @@ -463,13 +458,14 @@ set(VLLM_EXT_SRC "csrc/moe_align_block_size_kernels.cu" "csrc/pybind.cpp") -if(IS_CUDA) +if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" "csrc/custom_all_reduce.cu") endif() define_module_target(_C + "${VLLM_GPU_LANG}" "${VLLM_EXT_SRC}" "${VLLM_GPU_FLAGS}" "${VLLM_GPU_ARCHES}") @@ -483,6 +479,7 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/topk_softmax_kernels.cu") define_module_target(_moe_C + "${VLLM_GPU_LANG}" "${VLLM_MOE_EXT_SRC}" "${VLLM_GPU_FLAGS}" "${VLLM_GPU_ARCHES}") @@ -513,6 +510,7 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/punica_ops.cc") define_module_target(_punica_C + "${VLLM_GPU_LANG}" "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" "${VLLM_PUNICA_GPU_ARCHES}") @@ -532,12 +530,12 @@ define_module_target(_punica_C # add_custom_target(supported) -if (IS_CUDA OR HIP_FOUND) +if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling C extension.") add_dependencies(supported _C) endif() -if (IS_CUDA) +if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") add_dependencies(supported _moe_C) From d9cc84032b8276020edf7793341f6fb5a7d0ce7c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 18:06:51 +0000 Subject: [PATCH 48/76] update comment + remove some debug logging --- CMakeLists.txt | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c99bccd15d296..9082a28db5be6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,9 +232,6 @@ else() "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" "-D__CUDA_NO_HALF2_OPERATORS__") - message(DEBUG "nvcc: ${VLLM_GPU_FLAGS}") - message(DEBUG "punica nvcc: ${VLLM_PUNICA_GPU_FLAGS}") - # # Setup/process CUDA arch flags. # @@ -273,6 +270,7 @@ else() "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") endif() + message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") # Macro for converting a `gencode` version number to a cmake version number. @@ -364,9 +362,6 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - message(DEBUG "sources: ${SRCS}") - message(DEBUG "cxx sources: ${CXX_SRCS}") - # # Generate ROCm/HIP source file names from CUDA file names. # Since HIP files are generated code, they will appear in the build area @@ -378,7 +373,6 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") endforeach() - message(DEBUG "hip src: ${HIP_SRCS}") set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( @@ -391,14 +385,13 @@ macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # Swap out original extension sources with hipified sources. set(${OUT_SRCS} ${HIP_SRCS}) list(APPEND ${OUT_SRCS} ${CXX_SRCS}) - - message(DEBUG "final ext src: ${OUT_SRCS}") endmacro() # # Define a target named `MOD_NAME` for a single extension. 
The # arguments are: # +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. # MOD_SRC - the list of source files relative to CMakeLists.txt # directory. # MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. From 3999ed2f017fc39947ccef20fe87c0df7d966247 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 26 Feb 2024 22:50:17 +0000 Subject: [PATCH 49/76] review comments + some tweaks to setup.py --- CMakeLists.txt | 21 ++++++++------------- setup.py | 30 +++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9082a28db5be6..f7381c3051bf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,12 +84,7 @@ macro (run_python OUT EXPR ERR_MSG) endmacro() # -# Try to find the MPI package. -# -find_package(MPI) - -# -# Update cmake's `CMAKE_PREFIX_PATH` with probably torch locations. +# Update cmake's `CMAKE_PREFIX_PATH` with probable torch locations. # # Run `EXPR` in python after importing `PKG`. Use the result of this to extend @@ -509,28 +504,28 @@ define_module_target(_punica_C "${VLLM_PUNICA_GPU_ARCHES}") # -# Add the `supported` target which detects which extensions should be +# Add the `default` target which detects which extensions should be # built based on platform/architecture. This is the same logic that # setup.py uses to select which extensions should be built and should # be kept in sync. # -# The `supported` target makes direct use of cmake easier since knowledge +# The `default` target makes direct use of cmake easier since knowledge # of which extensions are supported has been factored in, e.g. # # mkdir build && cd build # cmake -G Ninja .. -# cmake --build . --target supported +# cmake --build . --target default # -add_custom_target(supported) +add_custom_target(default) if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP") message(STATUS "Enabling C extension.") - add_dependencies(supported _C) + add_dependencies(default _C) endif() if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") - add_dependencies(supported _moe_C) + add_dependencies(default _moe_C) set(ENABLE_PUNICA) # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or @@ -551,6 +546,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (ENABLE_PUNICA) message(STATUS "Enabling punica extension.") - add_dependencies(supported _punica_C) + add_dependencies(default _punica_C) endif() endif() diff --git a/setup.py b/setup.py index 1d0a133b8ce69..e6f1ada774a89 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,10 @@ MAIN_CUDA_VERSION = "12.1" +def is_sccache_available() -> bool: + return which("sccache") is not None + + def is_ccache_available() -> bool: return which("ccache") is not None @@ -42,6 +46,8 @@ def __init__(self, name, cmake_lists_dir='.', **kwa): class cmake_build_ext(build_ext): + # A flag to ensure that the cmake config step only runs once. 
+ did_config = False def build_extensions(self): # Ensure that CMake is present and working @@ -74,7 +80,12 @@ def build_extensions(self): if verbose: cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] - if is_ccache_available(): + if is_sccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', + ] + elif is_ccache_available(): cmake_args += [ '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', @@ -83,7 +94,11 @@ def build_extensions(self): # # Setup parallelism # - num_jobs = os.cpu_count() + try: + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version >= Version("11.2"): nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) @@ -107,16 +122,17 @@ def build_extensions(self): build_jobs = ['-j', str(num_jobs)] # Config - # TODO: this only needs to happen once - subprocess.check_call(['cmake', ext.cmake_lists_dir] + build_tool + - cmake_args, - cwd=self.build_temp) + if not cmake_build_ext.did_config: + cmake_build_ext.did_config = True + subprocess.check_call( + ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], + cwd=self.build_temp) # Build build_args = [ '--build', '.', '--config', cfg, '--target', ext_target_name ] - subprocess.check_call(['cmake'] + build_args + build_jobs, + subprocess.check_call(['cmake', *build_args, *build_jobs], cwd=self.build_temp) From c669467c8cc618706062f1a0a9924f47d79f253e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 19:53:12 +0000 Subject: [PATCH 50/76] move utilities to utils.cmake, change find python process to use binary, remove unneeded user site packages from path --- CMakeLists.txt | 173 ++++++------------------------------------------- setup.py | 7 ++ utils.cmake | 135 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 154 deletions(-) create mode 100644 utils.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f7381c3051bf9..e2106d1162f7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +include(utils.cmake) + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -29,76 +31,20 @@ set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") # -# Loop through all supported python versions until the first suitable version -# that has torch installed. -# -# Note: cmake is unable to pick the lowest supported version when multiple -# versions are available, even with `CMAKE_FIND_PACKAGE_SORT_ORDER`. +# Try to find python package with an executable that exactly matches +# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions. # -foreach(VER ${PYTHON_SUPPORTED_VERSIONS}) - message(STATUS "Attempting to find python ${VER} package.") - find_package(Python ${VER} COMPONENTS Interpreter Development.Module) - if (Python_FOUND) - # Attempt to import torch from python was just found. If torch can - # be imported, stop searching for other versions of python. - # Note: find_package(Torch) won't work here because cmake might not - # have the proper search path set yet. 
- execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "import torch" - OUTPUT_VARIABLE PYTHON_OUT - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR) - - if(PYTHON_ERROR_CODE EQUAL 0) - message(STATUS - "Found python version ${Python_VERSION} (${Python_EXECUTABLE}).") - break() - endif() - endif() -endforeach() - -if (NOT Python_FOUND) +if (VLLM_PYTHON_EXECUTABLE) + find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") +else() message(FATAL_ERROR - "No supported version of python (with pytorch) found. " - "('${PYTHON_SUPPORTED_VERSIONS}')") + "Please set VLLM_PYTHON_EXECUTABLE to the desired python version before " + "running cmake configure.") endif() # -# Run `EXPR` in python. The standard output of python is stored in `OUT` and -# has trailing whitespace stripped. If an error is encountered when running -# python, a fatal message `ERR_MSG` is issued. +# Update cmake's `CMAKE_PREFIX_PATH` with torch location. # -macro (run_python OUT EXPR ERR_MSG) - execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(NOT PYTHON_ERROR_CODE EQUAL 0) - message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") - endif() -endmacro() - -# -# Update cmake's `CMAKE_PREFIX_PATH` with probable torch locations. -# - -# Run `EXPR` in python after importing `PKG`. Use the result of this to extend -# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. -macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH - "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endmacro() - -# Add user site-packages path to `CMAKE_PREFIX_PATH`. -append_cmake_prefix_path("site" "site.getusersitepackages()") - -# Query torch for its install path and add to `CMAKE_PREFIX_PATH`. append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # @@ -343,93 +289,6 @@ endif() # Define extension targets # -# -# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set -# of CUDA source files. The names of the corresponding "hipified" sources are -# stored in `OUT_SRCS`. -# -macro(hipify_sources_target OUT_SRCS NAME ORIG_SRCS) - # - # Split into C++ and non-C++ (i.e. CUDA) sources. - # - set(SRCS ${ORIG_SRCS}) - set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - - # - # Generate ROCm/HIP source file names from CUDA file names. - # Since HIP files are generated code, they will appear in the build area - # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. - # - set(HIP_SRCS) - foreach (SRC ${SRCS}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} - DEPENDS hipify.py ${SRCS} - BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on ${NAME} extension source files.") - - # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() - -# -# Define a target named `MOD_NAME` for a single extension. 
The -# arguments are: -# -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. -# -# Note: optimization level/debug info is set via cmake build type. -# -function(define_module_target MOD_NAME MOD_GPU_LANG MOD_SRC MOD_EXTRA_GPU_FLAGS - MOD_GPU_ARCHES) - - # Add hipify preprocessing step when building with HIP/ROCm. - if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") - endif() - - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - if (MOD_GPU_LANG STREQUAL "HIP") - # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) - endif() - - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") - - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) - - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - - target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) - - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION vllm) -endfunction() - # # _C extension # @@ -452,7 +311,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/custom_all_reduce.cu") endif() -define_module_target(_C +define_gpu_extension_target( + _C + vllm "${VLLM_GPU_LANG}" "${VLLM_EXT_SRC}" "${VLLM_GPU_FLAGS}" @@ -466,7 +327,9 @@ set(VLLM_MOE_EXT_SRC "csrc/moe/moe_ops.cpp" "csrc/moe/topk_softmax_kernels.cu") -define_module_target(_moe_C +define_gpu_extension_target( + _moe_C + vllm "${VLLM_GPU_LANG}" "${VLLM_MOE_EXT_SRC}" "${VLLM_GPU_FLAGS}" @@ -497,7 +360,9 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" "csrc/punica/punica_ops.cc") -define_module_target(_punica_C +define_gpu_extension_target( + _punica_C + vllm "${VLLM_GPU_LANG}" "${VLLM_PUNICA_EXT_SRC}" "${VLLM_PUNICA_GPU_FLAGS}" diff --git a/setup.py b/setup.py index e6f1ada774a89..81e44dca9fb18 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ import os import re import subprocess +import sys from typing import List from packaging.version import parse, Version @@ -121,6 +122,12 @@ def build_extensions(self): build_tool = ['-G', 'Unix Makefiles'] build_jobs = ['-j', str(num_jobs)] + # Pass the python executable to cmake so it can find an exact + # match. + cmake_args += [ + '-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable) + ] + # Config if not cmake_build_ext.did_config: cmake_build_ext.did_config = True diff --git a/utils.cmake b/utils.cmake new file mode 100644 index 0000000000000..6b782e6f96d4c --- /dev/null +++ b/utils.cmake @@ -0,0 +1,135 @@ +# +# Attempt to find the python pacakge that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. 
+# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${VER}) is not one of the supported versions: " + "${SUPPORTED_VERSIONS}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +macro (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() +endmacro() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. + # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) +endmacro() + +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_DEST - module destination directory. +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# +# Note: optimization level/debug info is set via cmake build type. 
+# +function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC + MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + + # Add hipify preprocessing step when building with HIP/ROCm. + if (MOD_GPU_LANG STREQUAL "HIP") + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + + if (MOD_GPU_LANG STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + endif() + + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") + + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_GPU_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE + csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) +endfunction() From 4c9f6b0f8efc584c1a7756c50486dca8894fa9ac Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:15:24 +0000 Subject: [PATCH 51/76] fix typo --- utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.cmake b/utils.cmake index 6b782e6f96d4c..800d371328e9f 100644 --- a/utils.cmake +++ b/utils.cmake @@ -1,5 +1,5 @@ # -# Attempt to find the python pacakge that uses the same python executable as +# Attempt to find the python package that uses the same python executable as # `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. # macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) From 96d86ccd8ee753accfc6e04b4a348efd28a64505 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:36:12 +0000 Subject: [PATCH 52/76] add path to include of utils.cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e2106d1162f7c..adcc56c8ce5d8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -include(utils.cmake) +include(${CMAKE_SOURCE_DIR}/utils.cmake) # # Supported python versions. These versions will be searched in order, the From fcbd89f508ee8491d5ac5e7c1283701b08e49b4b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:41:08 +0000 Subject: [PATCH 53/76] try another path for utils.cmake --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adcc56c8ce5d8..64f53d8938d6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -include(${CMAKE_SOURCE_DIR}/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) # # Supported python versions. 
These versions will be searched in order, the From 2f0ed6d8728f8b9849aebbe9d855186fdb44153b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 20:53:04 +0000 Subject: [PATCH 54/76] add utils.cmake to Dockerfile + MANIFEST.in --- Dockerfile | 1 + MANIFEST.in | 1 + 2 files changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 97e629dc07abb..f9f6048dfe3d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -40,6 +40,7 @@ COPY csrc csrc COPY setup.py setup.py COPY hipify.py hipify.py COPY CMakeLists.txt CMakeLists.txt +COPY utils.cmake utils.cmake COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 38c9e58b4e73e..25087882bec72 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ include LICENSE include requirements.txt include CMakeLists.txt +include utils.cmake recursive-include csrc * From 086de5c09012acc68c383964fdeaef405211cb9e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Tue, 27 Feb 2024 21:41:47 +0000 Subject: [PATCH 55/76] remove mpi include directories --- utils.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils.cmake b/utils.cmake index 800d371328e9f..f2419bdb24a73 100644 --- a/utils.cmake +++ b/utils.cmake @@ -126,8 +126,7 @@ function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC target_compile_definitions(${MOD_NAME} PRIVATE "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE - csrc PRIVATE ${MPI_CXX_INCLUDE_DIRS}) + target_include_directories(${MOD_NAME} PRIVATE csrc) target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) From 0ff8825a28136daba6cc4731a91e7182c18099b7 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 18:34:27 +0000 Subject: [PATCH 56/76] refactor setup.py so cmake configuration is separate from cmake build --- setup.py | 178 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 98 insertions(+), 80 deletions(-) diff --git a/setup.py b/setup.py index 81e44dca9fb18..ddb59a1e85dc9 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,93 @@ def __init__(self, name, cmake_lists_dir='.', **kwa): class cmake_build_ext(build_ext): - # A flag to ensure that the cmake config step only runs once. - did_config = False + # A dict of extension directories that have been configured. + did_config = {} + + # + # Determine number of compilation jobs and optionally nvcc compile threads. + # + def compute_num_jobs(self): + try: + num_jobs = len(os.sched_getaffinity(0)) + except AttributeError: + num_jobs = os.cpu_count() + + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version >= Version("11.2"): + nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) + num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) + else: + nvcc_threads = None + + return num_jobs, nvcc_threads + + # + # Perform cmake configuration for a single extension. + # + def configure(self, ext): + # If we've already configured using the CMakeLists.txt for + # this extension, exit early. + if ext.cmake_lists_dir in cmake_build_ext.did_config: + return + + cmake_build_ext.did_config[ext.cmake_lists_dir] = True + + # Select the build type. + # Note: optimization level + debug info are set by the build type + cfg = os.getenv("CMAKE_BUILD_TYPE", "RelWithDebInfo") + + # where .so files will be written, should be the same for all extensions + # that use the same CMakeLists.txt. 
+ outdir = os.path.abspath( + os.path.dirname(self.get_ext_fullpath(ext.name))) + + cmake_args = [ + '-DCMAKE_BUILD_TYPE={}'.format(cfg), + '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir), + '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp), + ] + + verbose = bool(int(os.getenv('VERBOSE', '0'))) + if verbose: + cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] + + if is_sccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', + ] + elif is_ccache_available(): + cmake_args += [ + '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', + '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', + ] + + # Pass the python executable to cmake so it can find an exact + # match. + cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] + + # + # Setup parallelism and build tool + # + num_jobs, nvcc_threads = self.compute_num_jobs() + + if nvcc_threads: + cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] + + if is_ninja_available(): + build_tool = ['-G', 'Ninja'] + cmake_args += [ + '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', + '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), + ] + else: + # Default build tool to whatever cmake picks. + build_tool = [] + + subprocess.check_call( + ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], + cwd=self.build_temp) def build_extensions(self): # Ensure that CMake is present and working @@ -57,90 +142,23 @@ def build_extensions(self): except OSError as e: raise RuntimeError('Cannot find CMake executable') from e - for ext in self.extensions: - - extdir = os.path.abspath( - os.path.dirname(self.get_ext_fullpath(ext.name))) - - # Note: optimization level + debug info set by the build type - cfg = os.getenv("VLLM_BUILD_TYPE", "RelWithDebInfo") - - cmake_args = [ - '-DCMAKE_BUILD_TYPE=%s' % cfg, - # Ask CMake to place the resulting library in the directory - # containing the extension - '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format( - cfg.upper(), extdir), - # Other intermediate static libraries are placed in a - # temporary build directory instead - '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}'.format( - cfg.upper(), self.build_temp), - ] + # Create build directory if it does not exist. 
+ if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) - verbose = bool(int(os.getenv('VERBOSE', '0'))) - if verbose: - cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON'] - - if is_sccache_available(): - cmake_args += [ - '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache', - '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache', - ] - elif is_ccache_available(): - cmake_args += [ - '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache', - '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache', - ] - - # - # Setup parallelism - # - try: - num_jobs = len(os.sched_getaffinity(0)) - except AttributeError: - num_jobs = os.cpu_count() - - nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version >= Version("11.2"): - nvcc_threads = int(os.getenv("NVCC_THREADS", 8)) - num_jobs = max(1, round(num_jobs / (nvcc_threads / 4))) - cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)] - - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) + # Build all the extensions + for ext in self.extensions: + self.configure(ext) ext_target_name = remove_prefix(ext.name, "vllm.") + num_jobs, _ = self.compute_num_jobs() - if is_ninja_available(): - build_tool = ['-G', 'Ninja'] - cmake_args += [ - '-DCMAKE_JOB_POOL_COMPILE:STRING=compile', - '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs), - ] - build_jobs = [] - else: - build_tool = ['-G', 'Unix Makefiles'] - build_jobs = ['-j', str(num_jobs)] - - # Pass the python executable to cmake so it can find an exact - # match. - cmake_args += [ - '-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable) - ] - - # Config - if not cmake_build_ext.did_config: - cmake_build_ext.did_config = True - subprocess.check_call( - ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], - cwd=self.build_temp) - - # Build build_args = [ - '--build', '.', '--config', cfg, '--target', ext_target_name + '--build', '.', '--target', ext_target_name, '-j', + str(num_jobs) ] - subprocess.check_call(['cmake', *build_args, *build_jobs], - cwd=self.build_temp) + + subprocess.check_call(['cmake', *build_args], cwd=self.build_temp) def _is_cuda() -> bool: From f625d6d9d4027b3ca3b22e27cc76f187829b05fa Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:00:22 +0000 Subject: [PATCH 57/76] let --debug control build type if CMAKE_BUILD_TYPE is not set --- CMakeLists.txt | 2 ++ setup.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 64f53d8938d6f..a490eb458b37e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) # diff --git a/setup.py b/setup.py index ddb59a1e85dc9..fad642717dcdc 100644 --- a/setup.py +++ b/setup.py @@ -81,7 +81,8 @@ def configure(self, ext): # Select the build type. # Note: optimization level + debug info are set by the build type - cfg = os.getenv("CMAKE_BUILD_TYPE", "RelWithDebInfo") + default_cfg = "Debug" if self.debug else "RelWithDebInfo" + cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg) # where .so files will be written, should be the same for all extensions # that use the same CMakeLists.txt. 
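For context, a minimal sketch of how the CMakeExtension and cmake_build_ext classes above are typically wired into setup(). The actual setup() call is outside this excerpt, so the extension names below are assumptions inferred from the cmake targets (_C, _moe_C, _punica_C) and from the "vllm." prefix handling in build_extensions(); treat this as an illustration, not as code from the patch.

from setuptools import setup

# Hypothetical wiring; extension names assumed from the cmake targets above.
# remove_prefix() strips the leading "vllm.", so "vllm._C" builds the "_C"
# target. All extensions share one CMakeLists.txt, so the cmake configure step
# runs once (tracked in the did_config dict) and only per-target builds repeat.
ext_modules = [
    CMakeExtension(name="vllm._C"),
    CMakeExtension(name="vllm._moe_C"),
    CMakeExtension(name="vllm._punica_C"),
]

setup(
    ext_modules=ext_modules,
    cmdclass={"build_ext": cmake_build_ext},
)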
From efda6fb825dd9ea6d0212e891f9751138c3f8a1b Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:07:32 +0000 Subject: [PATCH 58/76] add some type annotations to setup.py --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index fad642717dcdc..41dd244f085c4 100644 --- a/setup.py +++ b/setup.py @@ -41,8 +41,8 @@ def remove_prefix(text, prefix): class CMakeExtension(Extension): - def __init__(self, name, cmake_lists_dir='.', **kwa): - Extension.__init__(self, name, sources=[], **kwa) + def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: + super().__init__(name, sources=[], **kwa) self.cmake_lists_dir = os.path.abspath(cmake_lists_dir) @@ -71,7 +71,7 @@ def compute_num_jobs(self): # # Perform cmake configuration for a single extension. # - def configure(self, ext): + def configure(self, ext: CMakeExtension) -> None: # If we've already configured using the CMakeLists.txt for # this extension, exit early. if ext.cmake_lists_dir in cmake_build_ext.did_config: @@ -136,7 +136,7 @@ def configure(self, ext): ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args], cwd=self.build_temp) - def build_extensions(self): + def build_extensions(self) -> None: # Ensure that CMake is present and working try: subprocess.check_output(['cmake', '--version']) From 3fc98a60c3ecb884eb4b0045684aee0dfbc25aa8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 19:24:45 +0000 Subject: [PATCH 59/76] add comment --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 41dd244f085c4..9f24588be48a3 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ class cmake_build_ext(build_ext): # def compute_num_jobs(self): try: + # os.sched_getaffinity() isn't univerally available, so fall back + # to os.cpu_count() if we get an error here. num_jobs = len(os.sched_getaffinity(0)) except AttributeError: num_jobs = os.cpu_count() From a68fec9d6e5ed3ad37ca360a0e1b03e17f8772c1 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:54:53 +0000 Subject: [PATCH 60/76] review comments + move most of the arch/compiler flag code to utils.cmake --- CMakeLists.txt | 246 ++++++---------------------- Dockerfile | 3 +- MANIFEST.in | 2 +- hipify.py => cmake/hipify.py | 0 cmake/utils.cmake | 308 +++++++++++++++++++++++++++++++++++ utils.cmake | 134 --------------- 6 files changed, 362 insertions(+), 331 deletions(-) rename hipify.py => cmake/hipify.py (100%) create mode 100644 cmake/utils.cmake delete mode 100644 utils.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a490eb458b37e..0b1bc113b97a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(vllm_extensions LANGUAGES CXX) message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") -include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # # Supported python versions. These versions will be searched in order, the @@ -13,10 +13,10 @@ include(${CMAKE_CURRENT_LIST_DIR}/utils.cmake) set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11") # Supported NVIDIA architectures. -set(NVIDIA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures. -set(ROCM_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100") # # Supported/expected torch versions for CUDA/ROCm. 
@@ -40,8 +40,8 @@ if (VLLM_PYTHON_EXECUTABLE) find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}") else() message(FATAL_ERROR - "Please set VLLM_PYTHON_EXECUTABLE to the desired python version before " - "running cmake configure.") + "Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version" + " before running cmake configure.") endif() # @@ -76,6 +76,11 @@ if (NOT HIP_FOUND AND CUDA_FOUND) elseif(HIP_FOUND) set(VLLM_GPU_LANG "HIP") + # Importing torch recognizes and sets up some HIP/ROCm configuration but does + # not let cmake recognize .hip files. In order to get cmake to understand the + # .hip extension automatically, HIP must be enabled explicitly. + enable_language(HIP) + # ROCm 5.x if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) @@ -94,197 +99,26 @@ else() endif() # -# Setup extra platform specific GPU compilation flags, e.g. NVCC flags for CUDA -# and hip flags for ROCm. -# -# Note: CUDA (and partially HIP) is detected by pytorch package so there's no -# need to repeat detecting it explicitly with check_language, etc. +# Override the GPU architectures detected by cmake/torch and filter them by +# the supported versions for the current language. +# The final set of arches is stored in `VLLM_GPU_ARCHES`. # -if (HIP_FOUND) - # Importing torch recognizes and sets up some HIP/ROCm configuration but does - # not let cmake recognize .hip files. In order to get cmake to understand the - # .hip extension automatically, HIP must be enabled explicitly. - enable_language(HIP) - - # - # `VLLM_HIP_ARCHITECUTRES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. - # - - # - # Find the intersection of the supported + detected architectures to - # set the module architecture flags. - # - set(VLLM_HIP_ARCHITECTURES) - foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST ROCM_SUPPORTED_ARCHS) - list(APPEND VLLM_HIP_ARCHITECTURES ${ARCH}) - endif() - endforeach() - - if(NOT VLLM_HIP_ARCHITECTURES) - message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${ROCM_SUPPORTED_ARCHS}.") - endif() - - set(VLLM_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - set(VLLM_PUNICA_GPU_ARCHES ${VLLM_HIP_ARCHITECTURES}) - - # Get common HIP/HIPCC flags from torch. - run_python(VLLM_GPU_FLAGS - "from torch.utils.cpp_extension import COMMON_HIP_FLAGS; print(';'.join(COMMON_HIP_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - run_python(VLLM_HIPCC_FLAGS - "from torch.utils.cpp_extension import COMMON_HIPCC_FLAGS; print(';'.join(COMMON_HIPCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - list(APPEND VLLM_GPU_FLAGS - ${VLLM_HIPCC_FLAGS} - "-DUSE_ROCM" - "-U__HIP_NO_HALF_CONVERSIONS__" - "-U__HIP_NO_HALF_OPERATORS__" - "-fno-gpu-rdc") - -else() - # - # Get common NVCC flags from torch. 
- # - run_python(VLLM_GPU_FLAGS - "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" - "Failed to determine torch nvcc compiler flags") - - if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND VLLM_GPU_FLAGS "-DENABLE_FP8_E5M2") - endif() - - if(NVCC_THREADS) - list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") - endif() - - set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) - - # - # Copy flags+update for punica - # - list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS - "-D__CUDA_NO_HALF_OPERATORS__" - "-D__CUDA_NO_HALF_CONVERSIONS__" - "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" - "-D__CUDA_NO_HALF2_OPERATORS__") - - # - # Setup/process CUDA arch flags. - # - # The torch cmake setup hardcodes the detected architecture flags in - # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it - # can't modified on a per-target basis, e.g. for the `punica` extension. - # So, all the `-gencode` flags need to be extracted and removed from - # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. - # Since it's not possible to use `target_compiler_options` for adding target - # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property - # must be used instead. This requires repackaging the architecture flags - # into a format that cmake expects for `CUDA_ARCHITECTURES`. - # - # This is a bit fragile in that it depends on torch using `-gencode` as opposed - # to one of the other nvcc options to specify architectures. - # - # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override - # detected architectures. - # - message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - - # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` - string(REGEX MATCHALL "-gencode arch=[^ ]+" VLLM_CUDA_ARCH_FLAGS - ${CMAKE_CUDA_FLAGS}) - - # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified - # and passed back via the `CUDA_ARCHITECTURES` property. - string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS - ${CMAKE_CUDA_FLAGS}) - - # If this error is triggered, it might mean that torch has changed how it sets - # up nvcc architecture code generation flags. - if (NOT VLLM_CUDA_ARCH_FLAGS) - message(FATAL_ERROR - "Could not find any architecture related code generation flags in " - "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") - endif() - - message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - message(DEBUG "arch flags: ${VLLM_CUDA_ARCH_FLAGS}") - - # Macro for converting a `gencode` version number to a cmake version number. - macro(string_to_ver OUT_VER IN_STR) - string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) - endmacro() - - # Initialize the architecture lists to empty. - set(VLLM_GPU_ARCHES) - set(VLLM_PUNICA_GPU_ARCHES) - - # Process each `gencode` flag. - foreach(ARCH ${VLLM_CUDA_ARCH_FLAGS}) - # For each flag, extract the version number and whether it refers to PTX - # or native code. - # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding - # for that match. - - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) - endif() - - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) - endif() - - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) - endif() - - # Make sure the virtual architecture can be matched. 
- if (NOT COMPUTE) - message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") - endif() - - # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) - message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") - endif() - - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) - else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) - endif() - - # Check if the current version is in the supported arch list. - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST NVIDIA_SUPPORTED_ARCHS) - message(STATUS "discarding unsupported CUDA arch ${VER}.") - continue() - endif() - - # Add it to the arch list. - list(APPEND VLLM_GPU_ARCHES "${CODE_ARCH}${VIRT}") +override_gpu_arches(VLLM_GPU_ARCHES + ${VLLM_GPU_LANG} + "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}") - # Add it to punica arch list if the version is >= 8.0. - if (CODE_VER GREATER_EQUAL 8.0) - list(APPEND VLLM_PUNICA_GPU_ARCHES "${CODE_ARCH}${VIRT}") - endif() - endforeach() +# +# Query torch for additional GPU compilation flags for the given +# `VLLM_GPU_LANG`. +# The final set of arches is stored in `VLLM_GPU_FLAGS`. +# +get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG}) - message(DEBUG "nvcc arch: ${VLLM_GPU_ARCHES}") - message(DEBUG "punica arch: ${VLLM_PUNICA_GPU_ARCHES}") +# +# Set nvcc parallelism. +# +if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") + list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") endif() # @@ -362,6 +196,30 @@ set(VLLM_PUNICA_EXT_SRC "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu" "csrc/punica/punica_ops.cc") +# +# Copy GPU compilation flags+update for punica +# +set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS}) +list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS + "-D__CUDA_NO_HALF_OPERATORS__" + "-D__CUDA_NO_HALF_CONVERSIONS__" + "-D__CUDA_NO_BFLOAT16_CONVERSIONS__" + "-D__CUDA_NO_HALF2_OPERATORS__") + +# +# Filter out CUDA architectures < 8.0 for punica. +# +if (${VLLM_GPU_LANG} STREQUAL "CUDA") + set(VLLM_PUNICA_GPU_ARCHES) + foreach(ARCH ${VLLM_GPU_ARCHES}) + string_to_ver(CODE_VER ${ARCH}) + if (CODE_VER GREATER_EQUAL 8.0) + list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH}) + endif() + endforeach() + message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") +endif() + define_gpu_extension_target( _punica_C vllm @@ -380,7 +238,7 @@ define_gpu_extension_target( # of which extensions are supported has been factored in, e.g. # # mkdir build && cd build -# cmake -G Ninja .. +# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm .. # cmake --build . 
--target default # add_custom_target(default) diff --git a/Dockerfile b/Dockerfile index f9f6048dfe3d9..6a56a33cfe7ac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,9 +38,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # copy input files COPY csrc csrc COPY setup.py setup.py -COPY hipify.py hipify.py +COPY cmake cmake COPY CMakeLists.txt CMakeLists.txt -COPY utils.cmake utils.cmake COPY requirements.txt requirements.txt COPY pyproject.toml pyproject.toml COPY vllm/__init__.py vllm/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 25087882bec72..aa16da6500e6c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ include LICENSE include requirements.txt include CMakeLists.txt -include utils.cmake +recursive-include cmake * recursive-include csrc * diff --git a/hipify.py b/cmake/hipify.py similarity index 100% rename from hipify.py rename to cmake/hipify.py diff --git a/cmake/utils.cmake b/cmake/utils.cmake new file mode 100644 index 0000000000000..f0a73316427b7 --- /dev/null +++ b/cmake/utils.cmake @@ -0,0 +1,308 @@ +# +# Attempt to find the python package that uses the same python executable as +# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. +# +macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) + file(REAL_PATH ${EXECUTABLE} EXECUTABLE) + set(Python_EXECUTABLE ${EXECUTABLE}) + find_package(Python COMPONENTS Interpreter Development.Module) + if (NOT Python_FOUND) + message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") + endif() + set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + message(FATAL_ERROR + "Python version (${VER}) is not one of the supported versions: " + "${SUPPORTED_VERSIONS}.") + endif() + message(STATUS "Found python matching: ${EXECUTABLE}.") +endmacro() + +# +# Run `EXPR` in python. The standard output of python is stored in `OUT` and +# has trailing whitespace stripped. If an error is encountered when running +# python, a fatal message `ERR_MSG` is issued. +# +macro (run_python OUT EXPR ERR_MSG) + execute_process( + COMMAND + "${Python_EXECUTABLE}" "-c" "${EXPR}" + OUTPUT_VARIABLE ${OUT} + RESULT_VARIABLE PYTHON_ERROR_CODE + ERROR_VARIABLE PYTHON_STDERR + OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(NOT PYTHON_ERROR_CODE EQUAL 0) + message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") + endif() +endmacro() + +# Run `EXPR` in python after importing `PKG`. Use the result of this to extend +# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. +macro (append_cmake_prefix_path PKG EXPR) + run_python(PREFIX_PATH + "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") + list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) +endmacro() + +# +# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set +# of CUDA source files. The names of the corresponding "hipified" sources are +# stored in `OUT_SRCS`. +# +macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) + # + # Split into C++ and non-C++ (i.e. CUDA) sources. + # + set(SRCS ${ORIG_SRCS}) + set(CXX_SRCS ${ORIG_SRCS}) + list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") + list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") + + # + # Generate ROCm/HIP source file names from CUDA file names. + # Since HIP files are generated code, they will appear in the build area + # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. 
+ # + set(HIP_SRCS) + foreach (SRC ${SRCS}) + string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) + string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) + list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") + endforeach() + + set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) + add_custom_target( + hipify${NAME} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} + BYPRODUCTS ${HIP_SRCS} + COMMENT "Running hipify on ${NAME} extension source files.") + + # Swap out original extension sources with hipified sources. + set(${OUT_SRCS} ${HIP_SRCS}) + list(APPEND ${OUT_SRCS} ${CXX_SRCS}) +endmacro() + +# +# Get additional GPU compiler flags from torch. +# +macro(get_torch_gpu_compiler_flags GPU_FLAGS GPU_LANG) + if (${GPU_LANG} STREQUAL "CUDA") + # + # Get common NVCC flags from torch. + # + run_python(${GPU_FLAGS} + "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) + list(APPEND ${GPU_FLAGS} "-DENABLE_FP8_E5M2") + endif() + + elseif(${GPU_LANG} STREQUAL "HIP") + # + # Get common HIP/HIPCC flags from torch. + # + run_python(${GPU_FLAGS} + "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" + "Failed to determine torch nvcc compiler flags") + + list(APPEND ${GPU_FLAGS} + "-DUSE_ROCM" + "-U__HIP_NO_HALF_CONVERSIONS__" + "-U__HIP_NO_HALF_OPERATORS__" + "-fno-gpu-rdc") + + endif() +endmacro() + +# Macro for converting a `gencode` version number to a cmake version number. +macro(string_to_ver OUT_VER IN_STR) + string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR}) +endmacro() + +# +# Override the GPU architectures detected by cmake/torch and filter them by +# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in +# `GPU_ARCHES`. +# +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supprted arches: ${GPU_SUPPORTED_ARCHES_LIST}") + + if (${GPU_LANG} STREQUAL "HIP") + # + # `GPU_ARCHES` controls the `--offload-arch` flags. + # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled + # via the `PYTORCH_ROCM_ARCH` env variable. + # + + # + # Find the intersection of the supported + detected architectures to + # set the module architecture flags. + # + set(${GPU_ARCHES}) + foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (ARCH IN_LIST GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${ARCH}) + endif() + endforeach() + + if(NOT ${GPU_ARCHES}) + message(FATAL_ERROR + "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES_LIST}.") + endif() + + elseif(${GPU_LANG} STREQUAL "CUDA") + # + # Setup/process CUDA arch flags. + # + # The torch cmake setup hardcodes the detected architecture flags in + # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it + # can't modified on a per-target basis, e.g. for the `punica` extension. + # So, all the `-gencode` flags need to be extracted and removed from + # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. + # Since it's not possible to use `target_compiler_options` for adding target + # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property + # must be used instead. 
This requires repackaging the architecture flags + # into a format that cmake expects for `CUDA_ARCHITECTURES`. + # + # This is a bit fragile in that it depends on torch using `-gencode` as opposed + # to one of the other nvcc options to specify architectures. + # + # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override + # detected architectures. + # + message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + + # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS` + string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified + # and passed back via the `CUDA_ARCHITECTURES` property. + string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS + ${CMAKE_CUDA_FLAGS}) + + # If this error is triggered, it might mean that torch has changed how it sets + # up nvcc architecture code generation flags. + if (NOT _CUDA_ARCH_FLAGS) + message(FATAL_ERROR + "Could not find any architecture related code generation flags in " + "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})") + endif() + + message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") + message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") + + # Initialize the architecture lists to empty. + set(${GPU_ARCHES}) + + # Process each `gencode` flag. + foreach(ARCH ${_CUDA_ARCH_FLAGS}) + # For each flag, extract the version number and whether it refers to PTX + # or native code. + # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding + # for that match. + + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) + if (COMPUTE) + set(COMPUTE ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) + if (SM) + set(SM ${CMAKE_MATCH_1}) + endif() + + string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) + if (CODE) + set(CODE ${CMAKE_MATCH_1}) + endif() + + # Make sure the virtual architecture can be matched. + if (NOT COMPUTE) + message(FATAL_ERROR + "Could not determine virtual architecture from: ${ARCH}.") + endif() + + # One of sm_ or compute_ must exist. + if ((NOT SM) AND (NOT CODE)) + message(FATAL_ERROR + "Could not determine a codegen architecture from: ${ARCH}.") + endif() + + if (SM) + set(VIRT "") + set(CODE_ARCH ${SM}) + else() + set(VIRT "-virtual") + set(CODE_ARCH ${CODE}) + endif() + + # Check if the current version is in the supported arch list. + string_to_ver(CODE_VER ${CODE_ARCH}) + if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES_LIST) + message(STATUS "discarding unsupported CUDA arch ${VER}.") + continue() + endif() + + # Add it to the arch list. + list(APPEND ${GPU_ARCHES} "${CODE_ARCH}${VIRT}") + endforeach() + endif() + message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") +endmacro() + +# +# Define a target named `MOD_NAME` for a single extension. The +# arguments are: +# +# MOD_DEST - module destination directory. +# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. +# MOD_SRC - the list of source files relative to CMakeLists.txt +# directory. +# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. +# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. +# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# +# Note: optimization level/debug info is set via cmake build type. 
+# +function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC + MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) + + # Add hipify preprocessing step when building with HIP/ROCm. + if (MOD_GPU_LANG STREQUAL "HIP") + hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + endif() + + Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + + if (MOD_GPU_LANG STREQUAL "HIP") + # Make this target dependent on the hipify preprocessor step. + add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + endif() + + if (MOD_GPU_ARCHES) + set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES + "${MOD_GPU_ARCHES}") + endif() + + set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + + target_compile_options(${MOD_NAME} PRIVATE + $<$:${MOD_EXTRA_GPU_FLAGS}>) + + target_compile_definitions(${MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + + target_include_directories(${MOD_NAME} PRIVATE csrc) + + target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + + install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) +endfunction() diff --git a/utils.cmake b/utils.cmake deleted file mode 100644 index f2419bdb24a73..0000000000000 --- a/utils.cmake +++ /dev/null @@ -1,134 +0,0 @@ -# -# Attempt to find the python package that uses the same python executable as -# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`. -# -macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) - file(REAL_PATH ${EXECUTABLE} EXECUTABLE) - set(Python_EXECUTABLE ${EXECUTABLE}) - find_package(Python COMPONENTS Interpreter Development.Module) - if (NOT Python_FOUND) - message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") - endif() - set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) - if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) - message(FATAL_ERROR - "Python version (${VER}) is not one of the supported versions: " - "${SUPPORTED_VERSIONS}.") - endif() - message(STATUS "Found python matching: ${EXECUTABLE}.") -endmacro() - -# -# Run `EXPR` in python. The standard output of python is stored in `OUT` and -# has trailing whitespace stripped. If an error is encountered when running -# python, a fatal message `ERR_MSG` is issued. -# -macro (run_python OUT EXPR ERR_MSG) - execute_process( - COMMAND - "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} - RESULT_VARIABLE PYTHON_ERROR_CODE - ERROR_VARIABLE PYTHON_STDERR - OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(NOT PYTHON_ERROR_CODE EQUAL 0) - message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") - endif() -endmacro() - -# Run `EXPR` in python after importing `PKG`. Use the result of this to extend -# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. -macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH - "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) -endmacro() - -# -# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set -# of CUDA source files. The names of the corresponding "hipified" sources are -# stored in `OUT_SRCS`. -# -macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) - # - # Split into C++ and non-C++ (i.e. CUDA) sources. - # - set(SRCS ${ORIG_SRCS}) - set(CXX_SRCS ${ORIG_SRCS}) - list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$") - list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$") - - # - # Generate ROCm/HIP source file names from CUDA file names. 
- # Since HIP files are generated code, they will appear in the build area - # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir. - # - set(HIP_SRCS) - foreach (SRC ${SRCS}) - string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC}) - string(REGEX REPLACE "cuda" "hip" SRC ${SRC}) - list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}") - endforeach() - - set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) - add_custom_target( - hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} - DEPENDS hipify.py ${SRCS} - BYPRODUCTS ${HIP_SRCS} - COMMENT "Running hipify on ${NAME} extension source files.") - - # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() - -# -# Define a target named `MOD_NAME` for a single extension. The -# arguments are: -# -# MOD_DEST - module destination directory. -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. -# -# Note: optimization level/debug info is set via cmake build type. -# -function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC - MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) - - # Add hipify preprocessing step when building with HIP/ROCm. - if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") - endif() - - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) - - if (MOD_GPU_LANG STREQUAL "HIP") - # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) - endif() - - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") - - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) - - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) - - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") - - target_include_directories(${MOD_NAME} PRIVATE csrc) - - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) - - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) -endfunction() From 094c448613595ae40fd52246fb0a6401b37539f8 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:55:20 +0000 Subject: [PATCH 61/76] typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9f24588be48a3..69c909ea92817 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ class cmake_build_ext(build_ext): # def compute_num_jobs(self): try: - # os.sched_getaffinity() isn't univerally available, so fall back + # os.sched_getaffinity() isn't universally available, so fall back # to os.cpu_count() if we get an error here. 
num_jobs = len(os.sched_getaffinity(0)) except AttributeError: From d461230ec43185b922461c87553280354f25ae82 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 20:58:34 +0000 Subject: [PATCH 62/76] utils.cmake typo --- cmake/utils.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f0a73316427b7..eb43502f13e0e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,7 +130,7 @@ endmacro() # macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) - message(STATUS "${GPU_LANG} supprted arches: ${GPU_SUPPORTED_ARCHES_LIST}") + message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES_LIST}") if (${GPU_LANG} STREQUAL "HIP") # From 8ff011652dfc451807d65ead6ff8fe9d002caaa9 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 28 Feb 2024 22:39:35 +0000 Subject: [PATCH 63/76] more detailed comment for libtorch_python.so --- CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b1bc113b97a9..ea873c65e0010 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,9 +57,16 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path") # find_package(Torch REQUIRED) -# For some reason torch does not add libtorch_python.so to the list of torch -# libraries to link. Find it by hand using `append_torchlib_if_found` from +# +# Normally `torch.utils.cpp_extension.CUDAExtension` would add +# `libtorch_python.so` for linking against an extension. Torch's cmake +# configuration does not include this library (presumably since the cmake +# config is used for standalone C++ binaries that link against torch). +# The `libtorch_python.so` library defines some of the glue code between +# torch/python via pybind and is required by VLLM extensions for this +# reason. So, add it by manually using `append_torchlib_if_found` from # torch's cmake setup. 
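As an aside on the comment above: if `append_torchlib_if_found` were ever unavailable from torch's cmake setup, the same library could be located by hand. A rough sketch only, assuming torch's usual install layout where `libtorch_python.so` sits under `${TORCH_INSTALL_PREFIX}/lib` (with `TORCH_INSTALL_PREFIX` provided by TorchConfig.cmake):

    # Locate libtorch_python manually and append it to the torch link libraries.
    find_library(TORCH_PYTHON_LIBRARY torch_python
      HINTS "${TORCH_INSTALL_PREFIX}/lib")
    if (TORCH_PYTHON_LIBRARY)
      list(APPEND TORCH_LIBRARIES "${TORCH_PYTHON_LIBRARY}")
    else()
      message(WARNING "libtorch_python not found; the pybind glue may fail to link.")
    endif()
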
+# append_torchlib_if_found(torch_python) # From 2e48dd7eb2ef7fdf8c4c576324476a207321a859 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 19:04:26 +0000 Subject: [PATCH 64/76] use cmake_parse_arguments for define_gpu_extension_target --- CMakeLists.txt | 33 ++++++++++++---------- cmake/utils.cmake | 71 +++++++++++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea873c65e0010..225c1832ea31a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,11 +156,12 @@ endif() define_gpu_extension_target( _C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_EXT_SRC}" - "${VLLM_GPU_FLAGS}" - "${VLLM_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + WITH_SOABI) # # _moe_C extension @@ -172,11 +173,12 @@ set(VLLM_MOE_EXT_SRC define_gpu_extension_target( _moe_C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_MOE_EXT_SRC}" - "${VLLM_GPU_FLAGS}" - "${VLLM_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_MOE_EXT_SRC} + COMPILE_FLAGS ${VLLM_GPU_FLAGS} + ARCHITECTURES ${VLLM_GPU_ARCHES} + WITH_SOABI) # # _punica_C extension @@ -229,11 +231,12 @@ endif() define_gpu_extension_target( _punica_C - vllm - "${VLLM_GPU_LANG}" - "${VLLM_PUNICA_EXT_SRC}" - "${VLLM_PUNICA_GPU_FLAGS}" - "${VLLM_PUNICA_GPU_ARCHES}") + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_PUNICA_EXT_SRC} + COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} + ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} + WITH_SOABI) # # Add the `default` target which detects which extensions should be diff --git a/cmake/utils.cmake b/cmake/utils.cmake index eb43502f13e0e..75fba84654233 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -258,51 +258,68 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) endmacro() # -# Define a target named `MOD_NAME` for a single extension. The +# Define a target named `GPU_MOD_NAME` for a single extension. The # arguments are: # -# MOD_DEST - module destination directory. -# MOD_GPU_LANG - the GPU language for this module, e.g CUDA, HIP, etc. -# MOD_SRC - the list of source files relative to CMakeLists.txt -# directory. -# MOD_EXTRA_GPU_FLAGS - extra compiler flags passed to NVCC/hip. -# MOD_GPU_ARCHES - a list of target GPU architectures in cmake format. -# Refer to documentation on `CMAKE_CUDA_ARCHITECTURES` -# and `CMAKE_HIP_ARCHITECTURES` for more info. +# DESTINATION - module destination directory. +# LANGUAGE - the GPU language for this module, e.g CUDA, HIP, +# etc. +# SOURCES - list of source files relative to CMakeLists.txt +# directory. +# ARCHITECTURES - a list of target GPU architectures in cmake +# format. +# Refer `CMAKE_CUDA_ARCHITECTURES` documentation +# and `CMAKE_HIP_ARCHITECTURES` for more info. +# COMPILE_FLAGS - extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - extra include directories. +# LINK_LIBRARIES - extra link libraries. +# WITH_SOABI - generate library with python SOABI suffix name. # # Note: optimization level/debug info is set via cmake build type. # -function (define_gpu_extension_target MOD_NAME MOD_DEST MOD_GPU_LANG MOD_SRC - MOD_EXTRA_GPU_FLAGS MOD_GPU_ARCHES) +function (define_gpu_extension_target GPU_MOD_NAME) + cmake_parse_arguments(PARSE_ARGV 1 + GPU + "WITH_SOABI" + "DESTINATION;LANGUAGE" + "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES") # Add hipify preprocessing step when building with HIP/ROCm. 
- if (MOD_GPU_LANG STREQUAL "HIP") - hipify_sources_target(MOD_SRC ${MOD_NAME} "${MOD_SRC}") + if (GPU_LANGUAGE STREQUAL "HIP") + hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}") endif() - Python_add_library(${MOD_NAME} MODULE ${MOD_SRC} WITH_SOABI) + if (GPU_WITH_SOABI) + set(GPU_WITH_SOABI WITH_SOABI) + else() + set(GPU_WITH_SOABI) + endif() + + Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI}) - if (MOD_GPU_LANG STREQUAL "HIP") + if (GPU_LANGUAGE STREQUAL "HIP") # Make this target dependent on the hipify preprocessor step. - add_dependencies(${MOD_NAME} hipify${MOD_NAME}) + add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME}) endif() - if (MOD_GPU_ARCHES) - set_target_properties(${MOD_NAME} PROPERTIES ${MOD_GPU_LANG}_ARCHITECTURES - "${MOD_GPU_ARCHES}") + if (GPU_ARCHITECTURES) + set_target_properties(${GPU_MOD_NAME} PROPERTIES + ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}") endif() - set_property(TARGET ${MOD_NAME} PROPERTY CXX_STANDARD 17) + set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17) - target_compile_options(${MOD_NAME} PRIVATE - $<$:${MOD_EXTRA_GPU_FLAGS}>) + target_compile_options(${GPU_MOD_NAME} PRIVATE + $<$:${GPU_COMPILE_FLAGS}>) - target_compile_definitions(${MOD_NAME} PRIVATE - "-DTORCH_EXTENSION_NAME=${MOD_NAME}") + target_compile_definitions(${GPU_MOD_NAME} PRIVATE + "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}") - target_include_directories(${MOD_NAME} PRIVATE csrc) + target_include_directories(${GPU_MOD_NAME} PRIVATE csrc + ${GPU_INCLUDE_DIRECTORIES}) - target_link_libraries(${MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES} + ${GPU_LIBRARIES}) - install(TARGETS ${MOD_NAME} LIBRARY DESTINATION ${MOD_DEST}) + install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION}) endfunction() From 5fc7a6acf7ef817be544d63ee9ea9fe75ac0704a Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 19:45:30 +0000 Subject: [PATCH 65/76] convert some macros to functions --- cmake/utils.cmake | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 75fba84654233..8adf37cdfcb57 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -24,11 +24,11 @@ endmacro() # has trailing whitespace stripped. If an error is encountered when running # python, a fatal message `ERR_MSG` is issued. # -macro (run_python OUT EXPR ERR_MSG) +function (run_python OUT EXPR ERR_MSG) execute_process( COMMAND "${Python_EXECUTABLE}" "-c" "${EXPR}" - OUTPUT_VARIABLE ${OUT} + OUTPUT_VARIABLE PYTHON_OUT RESULT_VARIABLE PYTHON_ERROR_CODE ERROR_VARIABLE PYTHON_STDERR OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -36,7 +36,8 @@ macro (run_python OUT EXPR ERR_MSG) if(NOT PYTHON_ERROR_CODE EQUAL 0) message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}") endif() -endmacro() + set(${OUT} ${PYTHON_OUT} PARENT_SCOPE) +endfunction() # Run `EXPR` in python after importing `PKG`. Use the result of this to extend # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. @@ -51,7 +52,7 @@ endmacro() # of CUDA source files. The names of the corresponding "hipified" sources are # stored in `OUT_SRCS`. # -macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) +function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) # # Split into C++ and non-C++ (i.e. CUDA) sources. 
# @@ -81,42 +82,43 @@ macro (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) COMMENT "Running hipify on ${NAME} extension source files.") # Swap out original extension sources with hipified sources. - set(${OUT_SRCS} ${HIP_SRCS}) - list(APPEND ${OUT_SRCS} ${CXX_SRCS}) -endmacro() + list(APPEND HIP_SRCS ${CXX_SRCS}) + set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) +endfunction() # # Get additional GPU compiler flags from torch. # -macro(get_torch_gpu_compiler_flags GPU_FLAGS GPU_LANG) +function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) if (${GPU_LANG} STREQUAL "CUDA") # # Get common NVCC flags from torch. # - run_python(${GPU_FLAGS} + run_python(GPU_FLAGS "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))" "Failed to determine torch nvcc compiler flags") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8) - list(APPEND ${GPU_FLAGS} "-DENABLE_FP8_E5M2") + list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2") endif() elseif(${GPU_LANG} STREQUAL "HIP") # # Get common HIP/HIPCC flags from torch. # - run_python(${GPU_FLAGS} + run_python(GPU_FLAGS "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))" "Failed to determine torch nvcc compiler flags") - list(APPEND ${GPU_FLAGS} + list(APPEND GPU_FLAGS "-DUSE_ROCM" "-U__HIP_NO_HALF_CONVERSIONS__" "-U__HIP_NO_HALF_OPERATORS__" "-fno-gpu-rdc") endif() -endmacro() + set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) +endfunction() # Macro for converting a `gencode` version number to a cmake version number. macro(string_to_ver OUT_VER IN_STR) @@ -128,9 +130,8 @@ endmacro() # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in # `GPU_ARCHES`. # -macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) - set(GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) - message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES_LIST}") +function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES}") if (${GPU_LANG} STREQUAL "HIP") # @@ -143,17 +144,17 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - set(${GPU_ARCHES}) + set(GPU_ARCHES) foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST GPU_SUPPORTED_ARCHES_LIST) - list(APPEND ${GPU_ARCHES} ${ARCH}) + if (ARCH IN_LIST GPU_SUPPORTED_ARCHES) + list(APPEND GPU_ARCHES ${ARCH}) endif() endforeach() - if(NOT ${GPU_ARCHES}) + if(NOT GPU_ARCHES) message(FATAL_ERROR "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES_LIST}.") + " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES}.") endif() elseif(${GPU_LANG} STREQUAL "CUDA") @@ -199,7 +200,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") # Initialize the architecture lists to empty. - set(${GPU_ARCHES}) + set(GPU_ARCHES) # Process each `gencode` flag. foreach(ARCH ${_CUDA_ARCH_FLAGS}) @@ -245,17 +246,18 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Check if the current version is in the supported arch list. 
string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES_LIST) + if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES) message(STATUS "discarding unsupported CUDA arch ${VER}.") continue() endif() # Add it to the arch list. - list(APPEND ${GPU_ARCHES} "${CODE_ARCH}${VIRT}") + list(APPEND GPU_ARCHES "${CODE_ARCH}${VIRT}") endforeach() endif() - message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") -endmacro() + message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") + set(${OUT_GPU_ARCHES} ${GPU_ARCHES} PARENT_SCOPE) +endfunction() # # Define a target named `GPU_MOD_NAME` for a single extension. The From baa1fa83b51351d93ecfcdaa27fc6e385de4c163 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 20:48:49 +0000 Subject: [PATCH 66/76] use underscores for variables set in macros --- cmake/utils.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8adf37cdfcb57..56b3b2f0da059 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -9,12 +9,12 @@ macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS) if (NOT Python_FOUND) message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.") endif() - set(VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") - set(SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) - if (NOT VER IN_LIST SUPPORTED_VERSIONS_LIST) + set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}") + set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN}) + if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST) message(FATAL_ERROR - "Python version (${VER}) is not one of the supported versions: " - "${SUPPORTED_VERSIONS}.") + "Python version (${_VER}) is not one of the supported versions: " + "${_SUPPORTED_VERSIONS_LIST}.") endif() message(STATUS "Found python matching: ${EXECUTABLE}.") endmacro() @@ -42,9 +42,9 @@ endfunction() # Run `EXPR` in python after importing `PKG`. Use the result of this to extend # `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported. macro (append_cmake_prefix_path PKG EXPR) - run_python(PREFIX_PATH + run_python(_PREFIX_PATH "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path") - list(APPEND CMAKE_PREFIX_PATH ${PREFIX_PATH}) + list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH}) endmacro() # From 5c0bd3238e1da21b3b318b5cd28ddfdf7d1c6b82 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:37:48 +0000 Subject: [PATCH 67/76] turn override_gpu_arches back into a macro --- CMakeLists.txt | 2 +- cmake/utils.cmake | 68 ++++++++++++++++++++++++----------------------- setup.py | 2 +- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 225c1832ea31a..dfb1bef0ed5a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -273,7 +273,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (ARCH_VER VERSION_LESS 8.0) message(STATUS "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") - set(ENABLE_PUNICA false) +# set(ENABLE_PUNICA false) break() endif() endforeach() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 56b3b2f0da059..686eea878632f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -130,8 +130,11 @@ endmacro() # `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in # `GPU_ARCHES`. # -function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) - message(STATUS "${GPU_LANG} supported arches: ${GPU_SUPPORTED_ARCHES}") +# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. 
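The note above comes down to CMake scoping rules: a function() body runs in its own variable scope, so any set() it performs (including edits to CMAKE_CUDA_FLAGS) is discarded on return unless PARENT_SCOPE is used, whereas a macro() is expanded inline and mutates the caller's variables directly. A minimal sketch with hypothetical helper names:

    # Inside a function, the modified value is dropped when the function returns.
    function(append_cuda_flag_fn)
      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
    endfunction()

    # Inside a macro, the caller's CMAKE_CUDA_FLAGS is updated in place.
    macro(append_cuda_flag_macro)
      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
    endmacro()

    append_cuda_flag_fn()     # no visible change to CMAKE_CUDA_FLAGS
    append_cuda_flag_macro()  # CMAKE_CUDA_FLAGS now ends with -lineinfo
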
+# +macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) + set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) + message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") if (${GPU_LANG} STREQUAL "HIP") # @@ -144,17 +147,17 @@ function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) # Find the intersection of the supported + detected architectures to # set the module architecture flags. # - set(GPU_ARCHES) - foreach (ARCH ${CMAKE_HIP_ARCHITECTURES}) - if (ARCH IN_LIST GPU_SUPPORTED_ARCHES) - list(APPEND GPU_ARCHES ${ARCH}) + set(${GPU_ARCHES}) + foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() endforeach() - if(NOT GPU_ARCHES) + if(NOT ${GPU_ARCHES}) message(FATAL_ERROR "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" - " supported. Supported ROCm architectures are: ${GPU_SUPPORTED_ARCHES}.") + " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() elseif(${GPU_LANG} STREQUAL "CUDA") @@ -200,63 +203,62 @@ function(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}") # Initialize the architecture lists to empty. - set(GPU_ARCHES) + set(${GPU_ARCHES}) # Process each `gencode` flag. - foreach(ARCH ${_CUDA_ARCH_FLAGS}) + foreach(_ARCH ${_CUDA_ARCH_FLAGS}) # For each flag, extract the version number and whether it refers to PTX # or native code. # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding # for that match. - string(REGEX MATCH "arch=compute_\([0-9]+a?\)" COMPUTE ${ARCH}) - if (COMPUTE) - set(COMPUTE ${CMAKE_MATCH_1}) + string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH}) + if (_COMPUTE) + set(_COMPUTE ${CMAKE_MATCH_1}) endif() - string(REGEX MATCH "code=sm_\([0-9]+a?\)" SM ${ARCH}) - if (SM) - set(SM ${CMAKE_MATCH_1}) + string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH}) + if (_SM) + set(_SM ${CMAKE_MATCH_1}) endif() - string(REGEX MATCH "code=compute_\([0-9]+a?\)" CODE ${ARCH}) - if (CODE) - set(CODE ${CMAKE_MATCH_1}) + string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH}) + if (_CODE) + set(_CODE ${CMAKE_MATCH_1}) endif() # Make sure the virtual architecture can be matched. - if (NOT COMPUTE) + if (NOT _COMPUTE) message(FATAL_ERROR - "Could not determine virtual architecture from: ${ARCH}.") + "Could not determine virtual architecture from: ${_ARCH}.") endif() # One of sm_ or compute_ must exist. - if ((NOT SM) AND (NOT CODE)) + if ((NOT _SM) AND (NOT _CODE)) message(FATAL_ERROR - "Could not determine a codegen architecture from: ${ARCH}.") + "Could not determine a codegen architecture from: ${_ARCH}.") endif() - if (SM) - set(VIRT "") - set(CODE_ARCH ${SM}) + if (_SM) + set(_VIRT "") + set(_CODE_ARCH ${_SM}) else() - set(VIRT "-virtual") - set(CODE_ARCH ${CODE}) + set(_VIRT "-virtual") + set(_CODE_ARCH ${_CODE}) endif() # Check if the current version is in the supported arch list. - string_to_ver(CODE_VER ${CODE_ARCH}) - if (NOT CODE_VER IN_LIST GPU_SUPPORTED_ARCHES) - message(STATUS "discarding unsupported CUDA arch ${VER}.") + string_to_ver(_CODE_VER ${_CODE_ARCH}) + if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST) + message(STATUS "discarding unsupported CUDA arch ${_VER}.") continue() endif() # Add it to the arch list. 
- list(APPEND GPU_ARCHES "${CODE_ARCH}${VIRT}") + list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}") endforeach() endif() message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") - set(${OUT_GPU_ARCHES} ${GPU_ARCHES} PARENT_SCOPE) endfunction() # diff --git a/setup.py b/setup.py index 69c909ea92817..2a412d24c4b17 100644 --- a/setup.py +++ b/setup.py @@ -191,7 +191,7 @@ def _install_punica() -> bool: for i in range(device_count): major, minor = torch.cuda.get_device_capability(i) if major < 8: - install_punica = False +# install_punica = False break return install_punica From 34e1f00045b1143fb0ab4e7bf14ca091f20bf63f Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:40:55 +0000 Subject: [PATCH 68/76] fix override_gpu_arches --- cmake/utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 686eea878632f..e3b994b8fa61b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -132,7 +132,7 @@ endmacro() # # Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`. # -macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) +macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN}) message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}") @@ -259,7 +259,7 @@ macro(override_gpu_arches OUT_GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) endforeach() endif() message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") -endfunction() +endmacro() # # Define a target named `GPU_MOD_NAME` for a single extension. The From 384d897806ee2237237593f159de7bbd1e8df1f0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 21:53:21 +0000 Subject: [PATCH 69/76] fixes for punica configuration --- CMakeLists.txt | 8 ++++---- cmake/utils.cmake | 2 +- setup.py | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dfb1bef0ed5a0..82bfcd68d11d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,17 +263,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") add_dependencies(default _moe_C) set(ENABLE_PUNICA) - # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=1 or + # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. - if (DEFINED VLLM_INSTALL_PUNICA_KERNELS OR ENV{VLLM_INSTALL_PUNICA_KERNELS}) - set(ENABLE_PUNICA true) + if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) + set(ENABLE_PUNICA ON) foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) string_to_ver(ARCH_VER ${ARCH_VER_STR}) if (ARCH_VER VERSION_LESS 8.0) message(STATUS "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") -# set(ENABLE_PUNICA false) + set(ENABLE_PUNICA OFF) break() endif() endforeach() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index e3b994b8fa61b..a50f0e1e5a76f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -258,7 +258,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}") endforeach() endif() - message(STATUS "${GPU_LANG} target arches: ${GPU_ARCHES}") + message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}") endmacro() # diff --git a/setup.py b/setup.py index 2a412d24c4b17..a71d3ece7a282 100644 --- a/setup.py +++ b/setup.py @@ -116,6 +116,9 @@ def configure(self, ext: CMakeExtension) -> None: # match. 
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)] + if _install_punica(): + cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON'] + # # Setup parallelism and build tool # @@ -191,7 +194,7 @@ def _install_punica() -> bool: for i in range(device_count): major, minor = torch.cuda.get_device_capability(i) if major < 8: -# install_punica = False + install_punica = False break return install_punica From 59204f69c65b99efabdaef6c56110d229dd17b4e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:06:47 +0000 Subject: [PATCH 70/76] debugging --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 82bfcd68d11d0..874477ddb3ce5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,8 +267,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) set(ENABLE_PUNICA ON) + message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) + message(STATUS "ARCH_VER_STR: ${ARCH_VER_STR}") string_to_ver(ARCH_VER ${ARCH_VER_STR}) if (ARCH_VER VERSION_LESS 8.0) message(STATUS From ec6ae13068c750c27d25db4bb7e8fc975e5ee5e3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:11:49 +0000 Subject: [PATCH 71/76] debug --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 874477ddb3ce5..65c25c4b4660e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -267,6 +267,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) set(ENABLE_PUNICA ON) + message(STATUS "arches: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) From fa42ea9800272b27b108c4ff7cda646e4a78a9c0 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Thu, 29 Feb 2024 22:31:19 +0000 Subject: [PATCH 72/76] revamp punica installation logic --- CMakeLists.txt | 45 +++++++++++++++++---------------------------- setup.py | 10 ++-------- 2 files changed, 19 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 65c25c4b4660e..770b830637649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -229,14 +229,19 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA") message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}") endif() -define_gpu_extension_target( - _punica_C - DESTINATION vllm - LANGUAGE ${VLLM_GPU_LANG} - SOURCES ${VLLM_PUNICA_EXT_SRC} - COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} - ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} - WITH_SOABI) +if (VLLM_PUNICA_GPU_ARCHES) + define_gpu_extension_target( + _punica_C + DESTINATION vllm + LANGUAGE ${VLLM_GPU_LANG} + SOURCES ${VLLM_PUNICA_EXT_SRC} + COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS} + ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES} + WITH_SOABI) +else() + message(WARNING "Unable to create _punica_C target because none of the " + "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. 
>= 8.0") +endif() # # Add the `default` target which detects which extensions should be @@ -262,27 +267,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling moe extension.") add_dependencies(default _moe_C) - set(ENABLE_PUNICA) # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or - # VLLM_INSTALL_PUNICA_KERNELS is set in the environment. - if (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS) - set(ENABLE_PUNICA ON) - message(STATUS "arches: ${CMAKE_CUDA_ARCHITECTURES}") - message(STATUS "native arches: ${CMAKE_CUDA_ARCHITECTURES_NATIVE}") - foreach (ARCH ${CMAKE_CUDA_ARCHITECTURES_NATIVE}) - string(REGEX MATCH "\([0-9]+\)" ARCH_VER_STR ${ARCH}) - message(STATUS "ARCH_VER_STR: ${ARCH_VER_STR}") - string_to_ver(ARCH_VER ${ARCH_VER_STR}) - if (ARCH_VER VERSION_LESS 8.0) - message(STATUS - "Unable to add punica extension due to device version ${ARCH_VER} < 8.0.") - set(ENABLE_PUNICA OFF) - break() - endif() - endforeach() - endif() - - if (ENABLE_PUNICA) + # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and + # there are supported target arches. + if (VLLM_PUNICA_GPU_ARCHES AND + (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS)) message(STATUS "Enabling punica extension.") add_dependencies(default _punica_C) endif() diff --git a/setup.py b/setup.py index a71d3ece7a282..aecc8a8c54bd4 100644 --- a/setup.py +++ b/setup.py @@ -189,14 +189,8 @@ def _is_cuda() -> bool: def _install_punica() -> bool: - install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 8: - install_punica = False - break - return install_punica + print(f"DEBUG device_count = {torch.cuda.device_count()}") + return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) def get_path(*filepath) -> str: From c3be2cf238d36be359d3e4cb749fba068dbd4b5c Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 Mar 2024 18:41:24 +0000 Subject: [PATCH 73/76] remove debugging cruft + add more detail to comment --- cmake/utils.cmake | 21 +++++++++++++-------- setup.py | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index a50f0e1e5a76f..bb222bb437b1d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -265,19 +265,24 @@ endmacro() # Define a target named `GPU_MOD_NAME` for a single extension. The # arguments are: # -# DESTINATION - module destination directory. -# LANGUAGE - the GPU language for this module, e.g CUDA, HIP, +# DESTINATION - Module destination directory. +# LANGUAGE - The GPU language for this module, e.g CUDA, HIP, # etc. -# SOURCES - list of source files relative to CMakeLists.txt +# SOURCES - List of source files relative to CMakeLists.txt # directory. -# ARCHITECTURES - a list of target GPU architectures in cmake +# +# Optional arguments: +# +# ARCHITECTURES - A list of target GPU architectures in cmake # format. # Refer `CMAKE_CUDA_ARCHITECTURES` documentation # and `CMAKE_HIP_ARCHITECTURES` for more info. -# COMPILE_FLAGS - extra compiler flags passed to NVCC/hip. -# INCLUDE_DIRECTORIES - extra include directories. -# LINK_LIBRARIES - extra link libraries. -# WITH_SOABI - generate library with python SOABI suffix name. +# ARCHITECTURES will use cmake's defaults if +# not provided. +# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip. +# INCLUDE_DIRECTORIES - Extra include directories. +# LINK_LIBRARIES - Extra link libraries. 
+# WITH_SOABI - Generate library with python SOABI suffix name. # # Note: optimization level/debug info is set via cmake build type. # diff --git a/setup.py b/setup.py index aecc8a8c54bd4..4309136c5b460 100644 --- a/setup.py +++ b/setup.py @@ -189,7 +189,6 @@ def _is_cuda() -> bool: def _install_punica() -> bool: - print(f"DEBUG device_count = {torch.cuda.device_count()}") return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) From 2225ece7e8b4ba834e6c67a45028db38d596f3c3 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 6 Mar 2024 18:46:57 +0000 Subject: [PATCH 74/76] merge marlin changes --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 770b830637649..29a531d44a9d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,7 @@ set(VLLM_EXT_SRC if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC "csrc/quantization/awq/gemm_kernels.cu" + "csrc/quantization/marlin/marlin_cuda_kernel.cu" "csrc/custom_all_reduce.cu") endif() From 5393d4cd3ff0b5765eb316918879abccb34ba83e Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Wed, 13 Mar 2024 19:09:22 +0000 Subject: [PATCH 75/76] fix merge error --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4309136c5b460..b31097b69f24d 100644 --- a/setup.py +++ b/setup.py @@ -308,7 +308,7 @@ def get_requirements() -> List[str]: if _is_cuda(): with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") - if nvcc_cuda_version <= Version("11.8"): + if get_nvcc_cuda_version() <= Version("11.8"): # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x for i in range(len(requirements)): if requirements[i].startswith("cupy-cuda12x"): From af254ce18b2f2dddb83e84a4e5c7d03f99e2e4da Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Mon, 18 Mar 2024 15:22:56 +0000 Subject: [PATCH 76/76] merge setup.py --- setup.py | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index b31097b69f24d..88787334be21a 100644 --- a/setup.py +++ b/setup.py @@ -184,31 +184,10 @@ def _is_neuron() -> bool: return torch_neuronx_installed -def _is_cuda() -> bool: - return (torch.version.cuda is not None) and not _is_neuron() - - def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) -def get_path(*filepath) -> str: - return os.path.join(ROOT_DIR, *filepath) - - -def find_version(filepath: str) -> str: - """Extract version information from the given filepath. - - Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py - """ - with open(filepath) as fp: - version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", - fp.read(), re.M) - if version_match: - return version_match.group(1) - raise RuntimeError("Unable to find version string.") - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -263,11 +242,28 @@ def get_nvcc_cuda_version() -> Version: return nvcc_cuda_version +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def find_version(filepath: str) -> str: + """Extract version information from the given filepath. 
+ + Adapted from https://github.com/ray-project/ray/blob/0b190ee1160eeca9796bc091e07eaebf4c85b511/python/setup.py + """ + with open(filepath) as fp: + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + fp.read(), re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + + def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) if _is_cuda(): - cuda_version = str(nvcc_cuda_version) + cuda_version = str(get_nvcc_cuda_version()) if cuda_version != MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] version += f"+cu{cuda_version_str}" @@ -283,11 +279,6 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" - elif _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - version += f"+cu{cuda_version_str}" else: raise RuntimeError("Unknown runtime environment")