bitsandbytes-foundation · Titus-von-Koeller · Apr 3, 2024 · Nov 10, 2023 · Nov 28, 2023 · Nov 30, 2023
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from . import cuda_setup, research, utils
+from . import device_setup, research, utils
 from .autograd._functions import (
     MatmulLtState,
     bmm_cublas,
@@ -17,7 +17,9 @@
 
 if COMPILED_WITH_CUDA:
     from .optim import adam
-
+    from .backends import register_backend
+    from .backends.cuda import CUDABackend
+    register_backend("cuda", CUDABackend())
 __pdoc__ = {
     "libbitsandbytes": False,
     "optim.optimizer.Optimizer8bit": False,

diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py
@@ -57,9 +57,8 @@ def print_debug_info() -> None:
 
 def main():
     generate_bug_report_information()
-
     from . import COMPILED_WITH_CUDA
-    from .cuda_setup.main import get_compute_capabilities
+    from .device_setup.cuda.main import get_compute_capabilities
 
     print_header("OTHER")
     print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}")

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
@@ -230,6 +230,10 @@ def supports_igemmlt(device: torch.device) -> bool:
     nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660')  # https://en.wikipedia.org/wiki/GeForce_16_series
     if any(model_name in device_name for model_name in nvidia16_models):
         return False  # these devices are technically cuda 7.5-capable, but they lack tensor cores
+    if device.type == "cpu":
+        # TODO: return True once CPU backend support has been upstreamed
+        return False
+
     return True
 
 
@@ -564,7 +568,7 @@ def matmul(
 
 def matmul_4bit(A: torch.Tensor, B: torch.Tensor, quant_state: F.QuantState, out: Optional[torch.Tensor] = None, bias=None):
     assert quant_state is not None
-    if A.numel() == A.shape[-1] and A.requires_grad == False:
+    if A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type == "cuda":
         if A.shape[-1] % quant_state.blocksize != 0:
             warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}')
             return MatMul4Bit.apply(A, B, out, bias, quant_state)

diff --git a/bitsandbytes/backends/__init__.py b/bitsandbytes/backends/__init__.py
@@ -0,0 +1,9 @@
+from typing import Dict
+import torch
+
+from bitsandbytes.backends.base import Backend
+
+backends: Dict[str, Backend] = {}
+
+def register_backend(backend_name: str, backend_instance: Backend):
+    backends[backend_name.lower()] = backend_instance
diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py
@@ -0,0 +1,133 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Tuple
+
+import torch
+
+from bitsandbytes.utils import QuantState
+
+
+class Backend(ABC):
+    """Base class for device backends that implement their own 8-bit and 4-bit functions."""
+
+    @abstractmethod
+    def double_quant(
+        self,
+        A,
+        col_stats=None,
+        row_stats=None,
+        out_col=None,
+        out_row=None,
+        threshold=0.0,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def transform(
+        self,
+        A,
+        to_order,
+        from_order="row",
+        out=None,
+        transpose=False,
+        state=None,
+        ld=None,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
+        raise NotImplementedError
+
+    @abstractmethod
+    def mm_dequant(
+        self,
+        A,
+        quant_state,
+        row_stats,
+        col_stats,
+        out=None,
+        new_row_stats=None,
+        new_col_stats=None,
+        bias=None,
+    ):
+        raise NotImplementedError
+
+    @abstractmethod
+    def extract_outliers(self, A, SA, idx):
+        raise NotImplementedError
+
+    @abstractmethod
+    def quantize_4bit(
+        self,
+        A: torch.Tensor,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize=64,
+        compress_statistics=False,
+        quant_type="fp4",
+        quant_storage=torch.uint8,
+    ) -> Tuple[torch.Tensor, QuantState]:
+        """
+        Quantize tensor A in blocks of 4-bit values.
+
+        Quantizes tensor A by dividing it into blocks which are independently quantized to the 4-bit data type given by quant_type.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            The output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+        Returns
+        -------
+        torch.Tensor:
+            Tensor with packed 4-bit values.
+        QuantState:
+            The quantization state to undo the quantization.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def dequantize_4bit(
+        self,
+        A: torch.Tensor,
+        quant_state: Optional[QuantState] = None,
+        absmax: Optional[torch.Tensor] = None,
+        out: Optional[torch.Tensor] = None,
+        blocksize: int = 64,
+        quant_type="fp4",
+    ) -> torch.Tensor:
+        """
+        Dequantizes 4-bit (FP4 or NF4) blockwise quantized values.
+
+        Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize.
+
+        Parameters
+        ----------
+        A : torch.Tensor
+            The input tensor (packed 4-bit values).
+        quant_state : QuantState
+            Object with quantization stats, incl. absmax values, original tensor shape and original dtype.
+        absmax : torch.Tensor
+            The absmax values.
+        out : torch.Tensor
+            Dequantized output tensor.
+        blocksize : int
+            The blocksize used in quantization.
+        quant_type : str
+            The 4-bit quantization data type {fp4, nf4}
+
+
+        Returns
+        -------
+        torch.Tensor:
+            Dequantized tensor.
+        """
+        raise NotImplementedError