[Hardware][Intel] Integrate CPU backend build procedure #2753

Closed
wants to merge 1 commit into from
16 changes: 16 additions & 0 deletions CMakeLists.txt
@@ -2,7 +2,10 @@ cmake_minimum_required(VERSION 3.21)

project(vllm_extensions LANGUAGES CXX)

+set(VLLM_TARGET_DEVICE "cpu" CACHE STRING "Target device backend for vLLM")
+
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -69,6 +72,19 @@ find_package(Torch REQUIRED)
#
append_torchlib_if_found(torch_python)

+#
+# Forward the non-CUDA device extensions to external CMake scripts.
+#
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+    NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+    if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+        include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+    else()
+        message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
+    endif()
+    return()
+endif()

#
# Set up GPU language and check the torch version and warn if it isn't
# what is expected.
77 changes: 77 additions & 0 deletions cmake/cpu_extension.cmake
@@ -0,0 +1,77 @@
#
# Check the compile flags
#
list(APPEND CXX_COMPILE_FLAGS
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo
                RESULT_VARIABLE CPUINFO_RET
                OUTPUT_VARIABLE CPUINFO)

if (NOT CPUINFO_RET EQUAL 0)
    message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()

function (find_isa CPUINFO TARGET OUT)
    string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
    if(NOT ISA_FOUND EQUAL -1)
        set(${OUT} ON PARENT_SCOPE)
    else()
        set(${OUT} OFF PARENT_SCOPE)
    endif()
endfunction()

find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

if (AVX512_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
        "-mavx512f"
        "-mavx512vl"
        "-mavx512bw"
        "-mavx512dq")

    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
    if (AVX512BF16_FOUND AND
        CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
        list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
    else()
        message(WARNING "AVX512-BF16 ISA support disabled; it requires avx512_bf16 CPU support and gcc/g++ >= 12.3")
    endif()
else()
    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")

#
# Define extension targets
#

#
# _C extension
#
set(VLLM_EXT_SRC
    "csrc/cpu/activation.cpp"
    "csrc/cpu/attention.cpp"
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
    "csrc/pybind.cpp")

define_gpu_extension_target(
    _C
    DESTINATION vllm
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
    WITH_SOABI
)

add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
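For reference, the ISA gate above boils down to a substring probe of /proc/cpuinfo. Below is a minimal Python sketch of the same check, assuming Linux; the helper name mirrors the CMake `find_isa()` but is otherwise illustrative, not part of the PR.

```python
# Minimal sketch of the ISA probe performed by cpu_extension.cmake above:
# read /proc/cpuinfo once and substring-match the feature flags (Linux only).
def find_isa(cpuinfo: str, target: str) -> bool:
    """Python mirror of the CMake find_isa() helper."""
    return target in cpuinfo

with open("/proc/cpuinfo") as f:
    cpuinfo = f.read()

if not find_isa(cpuinfo, "avx512f"):
    raise SystemExit("vLLM CPU backend requires AVX512 ISA support.")

# avx512_bf16 is optional; the build also requires gcc/g++ >= 12.3
# before it enables -mavx512bf16.
print("avx512_bf16 available:", find_isa(cpuinfo, "avx512_bf16"))
```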

19 changes: 15 additions & 4 deletions setup.py
@@ -13,6 +13,8 @@
from torch.utils.cpp_extension import CUDA_HOME

ROOT_DIR = os.path.dirname(__file__)
+# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+VLLM_TARGET_DEVICE = os.getenv("VLLM_TARGET_DEVICE", "cuda")

# vLLM only supports Linux platform
assert sys.platform.startswith(
@@ -61,8 +63,7 @@ def compute_num_jobs(self):
        except AttributeError:
            num_jobs = os.cpu_count()

-        nvcc_cuda_version = get_nvcc_cuda_version()
-        if nvcc_cuda_version >= Version("11.2"):
+        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
            nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
            num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))
        else:
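With this change the nvcc scaling only runs for CUDA builds (`_is_cuda()`); the formula itself is easiest to see with numbers. A worked example with hypothetical values:

```python
# Worked example of the job scaling above (values are hypothetical).
# With NVCC_THREADS=8, each nvcc invocation uses 8 threads, so the number
# of parallel jobs is divided by nvcc_threads / 4 = 2.
num_jobs = 32                     # e.g. os.cpu_count() on a 32-core machine
nvcc_threads = 8                  # default NVCC_THREADS
num_jobs = max(1, round(num_jobs / (nvcc_threads / 4)))
assert num_jobs == 16
```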
@@ -95,6 +96,7 @@ def configure(self, ext: CMakeExtension) -> None:
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
            '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
            '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
+            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]

        verbose = bool(int(os.getenv('VERBOSE', '0')))
@@ -168,11 +170,12 @@ def build_extensions(self) -> None:


def _is_cuda() -> bool:
-    return torch.version.cuda is not None
+    return VLLM_TARGET_DEVICE == "cuda" and torch.version.cuda is not None


def _is_hip() -> bool:
-    return torch.version.hip is not None
+    return (VLLM_TARGET_DEVICE == "cuda"
+            or VLLM_TARGET_DEVICE == "rocm") and torch.version.hip is not None


def _is_neuron() -> bool:
@@ -184,6 +187,10 @@ def _is_neuron() -> bool:
    return torch_neuronx_installed


+def _is_cpu() -> bool:
+    return VLLM_TARGET_DEVICE == "cpu"


def _install_punica() -> bool:
    return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))

@@ -279,6 +286,8 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"+neuron{neuron_version_str}"
+    elif _is_cpu():
+        version += "+cpu"
    else:
        raise RuntimeError("Unknown runtime environment")

@@ -311,6 +320,8 @@ def get_requirements() -> List[str]:
    elif _is_neuron():
        with open(get_path("requirements-neuron.txt")) as f:
            requirements = f.read().strip().split("\n")
+    elif _is_cpu():
+        requirements = []
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCM or Neuron.")
+            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
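Taken together, the changes make the target device selectable at build time: setup.py reads VLLM_TARGET_DEVICE, forwards it to CMake as -DVLLM_TARGET_DEVICE, and CMakeLists.txt routes non-CUDA/ROCm targets to cmake/cpu_extension.cmake. A minimal sketch of how a CPU build might be driven with this change; the pip invocation is an assumed workflow, not part of the PR.

```python
# Assumed build invocation for the CPU backend (not part of the PR itself).
import os
import subprocess
import sys

# Select the CPU backend via the environment variable read in setup.py.
env = dict(os.environ, VLLM_TARGET_DEVICE="cpu")

# Equivalent to running: VLLM_TARGET_DEVICE=cpu pip install -e .
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-e", "."],
    env=env,
)
```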