Fix diagnostic feedback #45

Merged: 7 commits, Sep 13, 2024
Changes from all commits
bitsandbytes/cextension.py (2 changes: 1 addition & 1 deletion)
@@ -120,7 +120,7 @@ def get_native_library() -> BNBNativeLibrary:
hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
BNB_BACKEND = "ROCM"
BNB_BACKEND = "ROCm"
else:
HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
BNB_HIP_VERSION_SHORT = ""
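For reviewers who want to see the renamed constant in context, here is a minimal standalone sketch of the backend detection that surrounds the changed line. The HIP variables mirror the diff; presenting it as a top-level if/else and the "CUDA" value in the else branch are illustrative assumptions, not a claim about the module's exact layout.

import torch

# Hedged sketch: torch.version.hip is a version string on ROCm builds of
# PyTorch and None on CUDA builds, so it can drive the backend choice.
if torch.version.hip:
    hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
    HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
    BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
    BNB_BACKEND = "ROCm"  # the casing fix in this commit ("ROCM" -> "ROCm")
else:
    HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
    BNB_HIP_VERSION_SHORT = ""
    BNB_BACKEND = "CUDA"  # assumed value for the non-HIP branch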
bitsandbytes/diagnostics/cuda.py (201 changes: 113 additions & 88 deletions)
@@ -5,7 +5,7 @@

import torch

from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.consts import NONPYTORCH_DOC_URL
from bitsandbytes.cuda_specs import CUDASpecs
from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,16 +32,18 @@
"_", # current Python interpreter
}

CUDA_RUNTIME_LIB_PATTERNS = (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)
logger = logging.getLogger(__name__)

if HIP_ENVIRONMENT:
CUDA_RUNTIME_LIB_PATTERNS = ("libamdhip64.so*",)

logger = logging.getLogger(__name__)
def get_runtime_lib_patterns() -> tuple:
if HIP_ENVIRONMENT:
return ("libamdhip64.so*",)
else:
return (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)


def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
@@ -58,8 +60,8 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
continue
except OSError: # Assume an esoteric error trying to poke at the directory
pass
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
for pth in dir.rglob(lib_pattern):
for lib_pattern in get_runtime_lib_patterns():
for pth in dir.glob(lib_pattern):
if pth.is_file() and not pth.is_symlink():
yield pth
except (OSError, PermissionError):
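The key behavioral change in this hunk is the switch from dir.rglob to dir.glob: each directory taken from a PATH-style value is now scanned only at its top level rather than recursively. A hedged, self-contained approximation of the lookup (the helper name and the LD_LIBRARY_PATH example are illustrative assumptions, not the library's API):

import os
from pathlib import Path
from typing import Iterable

def scan_for_runtime_libs(paths_list_candidate: str) -> Iterable[Path]:
    # Approximation of find_cuda_libraries_in_path_list: split the PATH-like
    # string, then match runtime-library patterns in each directory.
    patterns = ("cudart64*.dll", "libcudart*.so*", "nvcuda*.dll")  # CUDA case
    for entry in paths_list_candidate.split(os.pathsep):
        directory = Path(entry)
        if not entry or not directory.is_dir():
            continue
        for pattern in patterns:
            # glob (non-recursive) per this PR, instead of rglob (recursive)
            for pth in directory.glob(pattern):
                if pth.is_file() and not pth.is_symlink():
                    yield pth

# Illustrative usage:
# list(scan_for_runtime_libs(os.environ.get("LD_LIBRARY_PATH", "")))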
@@ -107,59 +109,38 @@ def find_cudart_libraries() -> Iterator[Path]:
yield from find_cuda_libraries_in_path_list(value)


def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
if not HIP_ENVIRONMENT:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
)
else:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
)

binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
if not HIP_ENVIRONMENT:
print_dedented(
f"""
Library not found: {binary_path}. Maybe you need to compile it from source?
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
for example, `make CUDA_VERSION=113`.

The CUDA version for the compile might depend on your conda install, if using conda.
Inspect CUDA version via `conda list | grep cuda`.
""",
)
else:
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
in PyTorch Settings matches your ROCM install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)
print_dedented(
f"""
Library not found: {binary_path}. Maybe you need to compile it from source?
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
for example, `make CUDA_VERSION=113`.

The CUDA version for the compile might depend on your conda install, if using conda.
Inspect CUDA version via `conda list | grep cuda`.
""",
)

cuda_major, cuda_minor = cuda_specs.cuda_version_tuple
if not HIP_ENVIRONMENT:
if cuda_major < 11:
print_dedented(
"""
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
You will be only to use 8-bit optimizers and quantization routines!
""",
)

print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")
else:
if (cuda_major, cuda_minor) < (6, 1):
print_dedented(
"""
WARNING: bitandbytes is fully supported only from ROCm 6.1.
""",
)
if cuda_major < 11:
print_dedented(
"""
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
You will be only to use 8-bit optimizers and quantization routines!
""",
)

print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")

# 7.5 is the minimum CC for cublaslt
if not cuda_specs.has_cublaslt and not HIP_ENVIRONMENT:
if not cuda_specs.has_cublaslt:
print_dedented(
"""
WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU!
@@ -173,44 +154,88 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
# (2) Multiple CUDA versions installed


def print_cuda_runtime_diagnostics() -> None:
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")

binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)

hip_major, hip_minor = cuda_specs.cuda_version_tuple
if (hip_major, hip_minor) < (6, 1):
print_dedented(
"""
WARNING: bitsandbytes is fully supported only from ROCm 6.1.
""",
)


def print_diagnostics(cuda_specs: CUDASpecs) -> None:
if HIP_ENVIRONMENT:
_print_hip_diagnostics(cuda_specs)
else:
_print_cuda_diagnostics(cuda_specs)


def _print_cuda_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print(f"{BNB_BACKEND} SETUP: WARNING! {BNB_BACKEND} runtime files not found in any environmental path.")
print("WARNING! CUDA runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
backend_version = torch.version.cuda if not HIP_ENVIRONMENT else torch.version.hip
print_dedented(
f"""
Found duplicate {BNB_BACKEND} runtime files (see below).
Found duplicate CUDA runtime files (see below).

We select the PyTorch default CUDA runtime, which is {torch.version.cuda},
but this might mismatch with the CUDA version that is needed for bitsandbytes.
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.

For example, if you want to use the CUDA version 122,
BNB_CUDA_VERSION=122 python ...

OR set the environmental variable in your .bashrc:
export BNB_CUDA_VERSION=122

We select the PyTorch default {BNB_BACKEND} runtime, which is {backend_version},
but this might mismatch with the {BNB_BACKEND} version that is needed for bitsandbytes.
In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
""",
)
for pth in cudart_paths:
print(f"* Found CUDA runtime at: {pth}")


def _print_hip_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("WARNING! ROCm runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
print_dedented(
f"""
Found duplicate ROCm runtime files (see below).

We select the PyTorch default ROCm runtime, which is {torch.version.hip},
but this might mismatch with the ROCm version that is needed for bitsandbytes.

To resolve it, install PyTorch built for the ROCm version you want to use

and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
""",
)
if not HIP_ENVIRONMENT:
print_dedented(
"""
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.

For example, if you want to use the CUDA version 122,
BNB_CUDA_VERSION=122 python ...

OR set the environmental variable in your .bashrc:
export BNB_CUDA_VERSION=122

In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
""",
)
else:
print_dedented(
"""
To resolve it, install PyTorch built for the ROCm version you want to use

and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/rocm-6.1.2,
""",
)

for pth in cudart_paths:
print(f"* Found {BNB_BACKEND} runtime at: {pth}")
print(f"* Found ROCm runtime at: {pth}")


def print_runtime_diagnostics() -> None:
if HIP_ENVIRONMENT:
_print_hip_runtime_diagnostics()
else:
_print_cuda_runtime_diagnostics()
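Because the duplicate-runtime messages now name the backend explicitly, it can help to check which runtime the local PyTorch build itself reports before acting on the warning. A small hedged snippet (not part of the PR) that mirrors the torch.version fields used above:

import torch

# torch.version.hip is set on ROCm builds of PyTorch and torch.version.cuda on
# CUDA builds; the dispatchers above report the matching one as the default.
if torch.version.hip:
    print(f"PyTorch ships ROCm runtime {torch.version.hip}")
elif torch.version.cuda:
    print(f"PyTorch ships CUDA runtime {torch.version.cuda}")
else:
    print("CPU-only PyTorch build: no CUDA or ROCm runtime bundled")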
bitsandbytes/diagnostics/main.py (8 changes: 4 additions & 4 deletions)
@@ -7,8 +7,8 @@
from bitsandbytes.consts import PACKAGE_GITHUB_URL
from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import (
print_cuda_diagnostics,
print_cuda_runtime_diagnostics,
print_diagnostics,
print_runtime_diagnostics,
)
from bitsandbytes.diagnostics.utils import print_dedented, print_header

@@ -63,8 +63,8 @@ def main():
print(f"2. {BNB_BACKEND} not installed")
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
print_cuda_runtime_diagnostics()
print_diagnostics(cuda_specs)
print_runtime_diagnostics()
print_header("")
print_header("DEBUG INFO END")
print_header("")
csrc/ops.hip (2 changes: 1 addition & 1 deletion)
@@ -618,7 +618,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
if (returnedAlgoCount == 0)
{
has_error = 1;
printf("Error: Matmul Algo Heurisitic didn't return algorithms\n");
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
}
else
{