Skip to content

Commit

Permalink
Merge pull request #45 from ROCm/fix_diagnostic_feedback
Browse files Browse the repository at this point in the history
Fix diagnostic feedback
  • Loading branch information
pnunna93 authored Sep 13, 2024
2 parents 1c5bd4f + 260a3ac commit 48bfb20
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 94 deletions.
2 changes: 1 addition & 1 deletion bitsandbytes/cextension.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def get_native_library() -> BNBNativeLibrary:
hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
BNB_BACKEND = "ROCM"
BNB_BACKEND = "ROCm"
else:
HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
BNB_HIP_VERSION_SHORT = ""
Expand Down
201 changes: 113 additions & 88 deletions bitsandbytes/diagnostics/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import torch

from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.consts import NONPYTORCH_DOC_URL
from bitsandbytes.cuda_specs import CUDASpecs
from bitsandbytes.diagnostics.utils import print_dedented
Expand All @@ -32,16 +32,18 @@
"_", # current Python interpreter
}

CUDA_RUNTIME_LIB_PATTERNS = (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)
logger = logging.getLogger(__name__)

if HIP_ENVIRONMENT:
CUDA_RUNTIME_LIB_PATTERNS = ("libamdhip64.so*",)

logger = logging.getLogger(__name__)
def get_runtime_lib_patterns() -> tuple:
    """Return the filename glob patterns used to find the GPU runtime library.

    The HIP runtime pattern is returned when ``HIP_ENVIRONMENT`` is truthy;
    otherwise the CUDA runtime patterns are returned.
    """
    hip_patterns = ("libamdhip64.so*",)
    cuda_patterns = (
        "cudart64*.dll",  # Windows
        "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
        "nvcuda*.dll",  # Windows
    )
    return hip_patterns if HIP_ENVIRONMENT else cuda_patterns


def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
Expand All @@ -58,8 +60,8 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
continue
except OSError: # Assume an esoteric error trying to poke at the directory
pass
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
for pth in dir.rglob(lib_pattern):
for lib_pattern in get_runtime_lib_patterns():
for pth in dir.glob(lib_pattern):
if pth.is_file() and not pth.is_symlink():
yield pth
except (OSError, PermissionError):
Expand Down Expand Up @@ -107,59 +109,38 @@ def find_cudart_libraries() -> Iterator[Path]:
yield from find_cuda_libraries_in_path_list(value)


def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
if not HIP_ENVIRONMENT:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
)
else:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
)

binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
if not HIP_ENVIRONMENT:
print_dedented(
f"""
Library not found: {binary_path}. Maybe you need to compile it from source?
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
for example, `make CUDA_VERSION=113`.
The CUDA version for the compile might depend on your conda install, if using conda.
Inspect CUDA version via `conda list | grep cuda`.
""",
)
else:
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
in PyTorch Settings matches your ROCM install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)
print_dedented(
f"""
Library not found: {binary_path}. Maybe you need to compile it from source?
If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
for example, `make CUDA_VERSION=113`.
The CUDA version for the compile might depend on your conda install, if using conda.
Inspect CUDA version via `conda list | grep cuda`.
""",
)

cuda_major, cuda_minor = cuda_specs.cuda_version_tuple
if not HIP_ENVIRONMENT:
if cuda_major < 11:
print_dedented(
"""
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
You will be only to use 8-bit optimizers and quantization routines!
""",
)

print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")
else:
if (cuda_major, cuda_minor) < (6, 1):
print_dedented(
"""
WARNING: bitandbytes is fully supported only from ROCm 6.1.
""",
)
if cuda_major < 11:
print_dedented(
"""
WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8().
You will be only to use 8-bit optimizers and quantization routines!
""",
)

print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}")

# 7.5 is the minimum CC for cublaslt
if not cuda_specs.has_cublaslt and not HIP_ENVIRONMENT:
if not cuda_specs.has_cublaslt:
print_dedented(
"""
WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU!
Expand All @@ -173,44 +154,88 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
# (2) Multiple CUDA versions installed


def print_cuda_runtime_diagnostics() -> None:
# Print ROCm-specific diagnostics derived from ``cuda_specs``.
# Reports the ROCm version string, prints a hint when the library path returned
# by get_cuda_bnb_library_path() does not exist, and warns when the version
# tuple is lower than (6, 1). Returns nothing; all output goes through print /
# print_dedented.
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")

# Resolve the library path for these specs and check that it is present on disk.
binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)

# cuda_version_tuple is unpacked as (major, minor); tuple comparison orders by
# major first, then minor, so anything below 6.1 triggers the warning.
hip_major, hip_minor = cuda_specs.cuda_version_tuple
if (hip_major, hip_minor) < (6, 1):
print_dedented(
"""
WARNING: bitsandbytes is fully supported only from ROCm 6.1.
""",
)


def print_diagnostics(cuda_specs: CUDASpecs) -> None:
    """Print library diagnostics for the active GPU backend.

    Dispatches to ``_print_hip_diagnostics`` when ``HIP_ENVIRONMENT`` is
    truthy and to ``_print_cuda_diagnostics`` otherwise, forwarding
    ``cuda_specs`` unchanged.
    """
    reporter = _print_hip_diagnostics if HIP_ENVIRONMENT else _print_cuda_diagnostics
    reporter(cuda_specs)


def _print_cuda_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print(f"{BNB_BACKEND} SETUP: WARNING! {BNB_BACKEND} runtime files not found in any environmental path.")
print("WARNING! CUDA runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
backend_version = torch.version.cuda if not HIP_ENVIRONMENT else torch.version.hip
print_dedented(
f"""
Found duplicate {BNB_BACKEND} runtime files (see below).
Found duplicate CUDA runtime files (see below).
We select the PyTorch default CUDA runtime, which is {torch.version.cuda},
but this might mismatch with the CUDA version that is needed for bitsandbytes.
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.
For example, if you want to use the CUDA version 122,
BNB_CUDA_VERSION=122 python ...
OR set the environmental variable in your .bashrc:
export BNB_CUDA_VERSION=122
We select the PyTorch default {BNB_BACKEND} runtime, which is {backend_version},
but this might mismatch with the {BNB_BACKEND} version that is needed for bitsandbytes.
In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
""",
)
for pth in cudart_paths:
print(f"* Found CUDA runtime at: {pth}")


def _print_hip_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("WARNING! ROCm runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
print_dedented(
f"""
Found duplicate ROCm runtime files (see below).
We select the PyTorch default ROCm runtime, which is {torch.version.hip},
but this might mismatch with the ROCm version that is needed for bitsandbytes.
To resolve it, install PyTorch built for the ROCm version you want to use
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
""",
)
if not HIP_ENVIRONMENT:
print_dedented(
"""
To override this behavior set the `BNB_CUDA_VERSION=<version string, e.g. 122>` environmental variable.
For example, if you want to use the CUDA version 122,
BNB_CUDA_VERSION=122 python ...
OR set the environmental variable in your .bashrc:
export BNB_CUDA_VERSION=122
In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
""",
)
else:
print_dedented(
"""
To resolve it, install PyTorch built for the ROCm version you want to use
and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/rocm-6.1.2,
""",
)

for pth in cudart_paths:
print(f"* Found {BNB_BACKEND} runtime at: {pth}")
print(f"* Found ROCm runtime at: {pth}")


def print_runtime_diagnostics() -> None:
    """Print runtime-library diagnostics for the active GPU backend.

    Dispatches to ``_print_hip_runtime_diagnostics`` when ``HIP_ENVIRONMENT``
    is truthy and to ``_print_cuda_runtime_diagnostics`` otherwise.
    """
    reporter = _print_hip_runtime_diagnostics if HIP_ENVIRONMENT else _print_cuda_runtime_diagnostics
    reporter()
8 changes: 4 additions & 4 deletions bitsandbytes/diagnostics/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from bitsandbytes.consts import PACKAGE_GITHUB_URL
from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import (
print_cuda_diagnostics,
print_cuda_runtime_diagnostics,
print_diagnostics,
print_runtime_diagnostics,
)
from bitsandbytes.diagnostics.utils import print_dedented, print_header

Expand Down Expand Up @@ -63,8 +63,8 @@ def main():
print(f"2. {BNB_BACKEND} not installed")
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
print_cuda_runtime_diagnostics()
print_diagnostics(cuda_specs)
print_runtime_diagnostics()
print_header("")
print_header("DEBUG INFO END")
print_header("")
Expand Down
2 changes: 1 addition & 1 deletion csrc/ops.hip
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
if (returnedAlgoCount == 0)
{
has_error = 1;
printf("Error: Matmul Algo Heurisitic didn't return algorithms\n");
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
}
else
{
Expand Down

0 comments on commit 48bfb20

Please sign in to comment.