Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CMake and Windows Compilation #788

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# This CMake config hopefully makes it easier to compile.
# Ensure the CUDA Toolkit is available on your path. Then run:
#   For GCC:  `cmake -B build . && cmake --build build`
#   For MSVC: `cmake -B build . && cmake --build build --config Release`
# You can also use the following options:
#   - BUILD_CUDA: Default ON, will build with CUDA
#   - NO_CUBLASLT: Default OFF, will skip building/linking cuBLASLt support
#   - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
#     is whatever CMake finds on your path.
#   - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC.
#     Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90`
#     Check your compute capability here: https://developer.nvidia.com/cuda-gpus
#   - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
cmake_minimum_required(VERSION 3.18)

project(bitsandbytes LANGUAGES C CXX)

option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
# This option gates cuBLASLt (not cuBLAS); the help string previously said "Disable CUBLAS".
option(NO_CUBLASLT "Disable cuBLASLt support" OFF)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

# Sources common to CPU and CUDA builds; CUDA_FILES is appended to SRC_FILES
# further below only when BUILD_CUDA is enabled.
list(APPEND SRC_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp)
list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)

message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")

# Base name of the built library; variant suffixes (_cuda117, _nocublaslt, _cpu)
# are appended below so the Python loader can select the matching binary.
set(BNB_OUTPUT_NAME "libbitsandbytes")

if(BUILD_CUDA)
  enable_language(CUDA) # This will fail if CUDA is not found

  # Convert the CUDA version from X.Y.Z to XY, e.g. "11.7.99" -> "117".
  # The '.' must be escaped ("\\.") — unescaped it matches ANY character in a
  # CMake regex, which only worked by accident for well-formed version strings.
  string(REGEX MATCH "^[0-9]+\\.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
  string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}")

  # Expose a cache variable that the user can set to ensure the correct version of CUDA is found
  set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode")

  message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})")
  message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")

  # The user-requested version must match the discovered compiler version.
  # Both sides quoted so empty/odd values can't break the if() parse.
  if(NOT "${CUDA_VERSION}" STREQUAL "${CUDA_VERSION_SHORT}")
    message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}."
      " Ensure the desired CUDA compiler is the first one available on your PATH."
    )
  endif()

  # Only CUDA 11.x and 12.x toolkits are supported.
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
    message(FATAL_ERROR "CUDA Version < 11 is not supported")
  elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
    message(FATAL_ERROR "CUDA Version > 12 is not supported")
  endif()

  string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
  if(PTXAS_VERBOSE)
    # Verbose? Outputs register usage information, and other things...
    string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
  endif()

  # Derive the plain numeric capabilities (e.g. "86") from entries like
  # "86-real"/"86-virtual".
  # NOTE(review): CMAKE_CUDA_ARCHITECTURES_ALL was introduced in CMake 3.23;
  # with the declared 3.18 minimum this list may be empty on older CMake —
  # confirm whether the minimum version should be raised.
  foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
    # Most of the items here are like: `xx-real`, so we just extract the `xx` portion
    string(REGEX MATCH "[0-9]+" capability_id "${capability}")
    if(capability_id GREATER 0)
      list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
    endif()
  endforeach()

  # This can be changed via -D argument to CMake
  # By default all possible capabilities are compiled
  set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")

  message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
  message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")

  # Emit one -gencode pair per selected capability.
  foreach(capability ${COMPUTE_CAPABILITY})
    string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
  endforeach()

  message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")

  list(APPEND SRC_FILES ${CUDA_FILES})

  # Encode the CUDA variant into the library name, e.g. libbitsandbytes_cuda117.
  string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
  if(NO_CUBLASLT)
    string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
  endif()
else()
  message(STATUS "Building CPU Only")
  string(APPEND BNB_OUTPUT_NAME "_cpu")
  if(NO_CUBLASLT)
    message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
  endif()
endif()

# The shared library exposing the C entry points loaded by the Python side.
add_library(libbitsandbytes SHARED ${SRC_FILES})
# Relative include paths are resolved against the current source dir; spell
# that out explicitly so the intent survives refactors/subproject inclusion.
target_include_directories(libbitsandbytes PUBLIC
  ${CMAKE_CURRENT_SOURCE_DIR}/csrc
  ${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_compile_features(libbitsandbytes PUBLIC cxx_std_14)


if(BUILD_CUDA)
  target_compile_definitions(libbitsandbytes PUBLIC BUILD_CUDA)
  # Use FindCUDAToolkit's imported targets (CMake >= 3.17) rather than bare
  # library names: they carry the correct include/link directories on every
  # platform, including MSVC where the toolkit libs are not on the default path.
  find_package(CUDAToolkit REQUIRED)
  target_link_libraries(libbitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you should be using https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html#module:FindCUDAToolkit and the targets associated with it

# cuBLASLt is optional (per the setup code, it needs compute capability >= 7.5
# hardware): either link it, or define NO_CUBLASLT so the fallback paths build.
if(NOT NO_CUBLASLT)
  target_link_libraries(libbitsandbytes PUBLIC cublasLt)
else()
  target_compile_definitions(libbitsandbytes PUBLIC NO_CUBLASLT)
endif()

# Allow device code to be compiled per translation unit and device-linked.
set_target_properties(libbitsandbytes PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
endif()

set_target_properties(libbitsandbytes
  PROPERTIES
    OUTPUT_NAME ${BNB_OUTPUT_NAME}
    # BNB_OUTPUT_NAME already begins with "lib"; clear the platform prefix so
    # Unix builds produce libbitsandbytes_*.so rather than liblibbitsandbytes_*.so
    # (the Python loader looks for the un-doubled name).
    PREFIX ""
    # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made.
    # RUNTIME_OUTPUT_DIRECTORY only covers Windows DLLs; LIBRARY_OUTPUT_DIRECTORY
    # is what places Unix shared objects, so set both to the package directory.
    RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
    LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
    POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers
    WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
)
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ CUDA_VERSION=117 make cuda11x
python setup.py install
```

On Windows you *must* compile it from source. See [compile_from_source](./compile_from_source.md).

**Using Int8 inference with HuggingFace Transformers**

```python
Expand Down
25 changes: 16 additions & 9 deletions bitsandbytes/cuda_setup/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,16 +116,23 @@ def manual_override(self):
def run_cuda_setup(self):
self.initialized = True
self.cuda_setup_log = []


package_dir = Path(__file__).parent.parent
binary_name, cudart_path, cc, cuda_version_string = evaluate_cuda_setup()
# Find the correct suffix based on what we can see
for suffix in (".so", ".dll"):
binary_path = package_dir / f"{binary_name}{suffix}"
if binary_path.exists():
binary_name = f"{binary_name}{suffix}"
break

self.cudart_path = cudart_path
self.cuda_available = torch.cuda.is_available()
self.cc = cc
self.cuda_version_string = cuda_version_string
self.binary_name = binary_name
self.manual_override()

package_dir = Path(__file__).parent.parent
binary_path = package_dir / self.binary_name

try:
Expand All @@ -150,10 +157,10 @@ def run_cuda_setup(self):
self.add_log_entry('')
self.generate_instructions()
raise Exception('CUDA SETUP: Setup Failed!')
self.lib = ct.cdll.LoadLibrary(binary_path)
self.lib = ct.cdll.LoadLibrary(str(binary_path))
else:
self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path}...")
self.lib = ct.cdll.LoadLibrary(binary_path)
self.add_log_entry(f"CUDA SETUP: Loading binary {binary_path!s}...")
self.lib = ct.cdll.LoadLibrary(str(binary_path))
except Exception as ex:
self.add_log_entry(str(ex))

Expand Down Expand Up @@ -332,7 +339,7 @@ def evaluate_cuda_setup():
cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'),
('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues'))
cuda_setup.add_log_entry('='*80)
if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None
if not torch.cuda.is_available(): return 'libbitsandbytes_cpu', None, None, None

cudart_path = determine_cuda_runtime_lib_path()
ccs = get_compute_capabilities()
Expand All @@ -356,9 +363,9 @@ def evaluate_cuda_setup():
# since most installations will have the libcudart.so installed, but not the compiler

if has_cublaslt:
binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}"
else:
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt"
binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt"

return binary_name, cudart_path, cc, cuda_version_string
20 changes: 19 additions & 1 deletion compile_from_source.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Compiling from source

Basic steps.
Basic steps for Unix (see Windows steps below):
1. `CUDA_VERSION=XXX make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cuda12x, cpuonly`
2. `python setup.py install`

Expand Down Expand Up @@ -38,3 +38,21 @@ If you have problems compiling the library with these instructions from source,

Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler`

# Compilation on Windows

We'll use CMake to do all the heavy lifting for us here. CUDA and the MSVC compiler can be finicky.

- Install [Microsoft Visual Studio](https://visualstudio.microsoft.com/)
- Install the CUDA Toolkit to match your pytorch CUDA version
- This will install `CUDA xx.y.props` to `BuildCustomizations` (see some documentation [here](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html#sample-projects))
- i.e. for Visual Studio 2022 and CUDA 11.7, there should be some files `CUDA 11.7...` in here: `C:\Program Files\Microsoft Visual Studio\2022\Professional\MSBuild\Microsoft\VC\v170\BuildCustomizations`
- Install CMake, at least 3.18 (the latest version is usually fine)
- [Optional] Lookup your GPU's [CUDA Compute Capability](https://developer.nvidia.com/cuda-gpus)
- If you don't do this, it will compile optimized code for all possible compute capabilities, which takes much longer...
- Insert it into the command below (e.g. `8.6` -> `86`)
- Configure the CMake Project:
- `cmake -B build . "-DCOMPUTE_CAPABILITY=86"`
- Build the project
- `cmake --build build --config Release`
- Install bitsandbytes
- `pip install .`
21 changes: 19 additions & 2 deletions csrc/cpu_ops.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#include <BinSearch.h>
#ifdef _WIN32
#include <thread>
#else
#include <pthread.h>
#endif
#include <common.h>

using namespace BinSearch;
Expand Down Expand Up @@ -31,7 +35,11 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
{
long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
#ifdef _WIN32
std::thread *threads = (std::thread *) malloc(sizeof(std::thread) * valid_chunks);
#else
pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
#endif

struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *));

Expand All @@ -55,14 +63,23 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
arg->threadidx = block_idx / blocksize;
arg->blocksize = blocksize;

#ifdef _WIN32
new (&threads[chunks_processed]) std::thread(quantize_block, arg);
#else
pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg);
#endif
chunks_processed += 1;
if(chunks_processed == valid_chunks){ break; }
}

for (int i = 0; i < valid_chunks; i++)
{
#ifdef _WIN32
threads[i].join();
#else
int err = pthread_join(threads[i], NULL);

#endif
}
free(threads);
for (int i = 0; i < valid_chunks; i++)
free(args[i]);
Expand Down
Loading