diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index ac8e9de00..6ae3c7c0a 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -18,15 +18,15 @@ body:
       label: Reproduction
       description: |
         Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
-        Please provide the simplest reproducer as possible so that we can quickly fix the issue. 
+        Please provide the simplest reproducer as possible so that we can quickly fix the issue.
       placeholder: |
-        Reproducer:
-        
+        Reproducer:
+
 
   - type: textarea
     id: expected-behavior
     validations:
       required: true
     attributes:
       label: Expected behavior
-      description: "A clear and concise description of what you would expect to happen."
\ No newline at end of file
+      description: "A clear and concise description of what you would expect to happen."
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 4e75c2a64..c39f346b9 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -18,7 +18,7 @@ body:
     attributes:
       label: Motivation
       description: |
-        Please outline the motivation for the proposal. Is your feature request related to a problem? 
+        Please outline the motivation for the proposal. Is your feature request related to a problem?
 
   - type: textarea
     id: contribution
@@ -27,4 +27,4 @@ body:
     attributes:
       label: Your contribution
       description: |
-        Is there any way that you could help, e.g. by submitting a PR?
\ No newline at end of file
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index dace206b1..40ea8b5bc 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,4 +14,4 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
-      repo_owner: TimDettmers
\ No newline at end of file
+      repo_owner: TimDettmers
diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled
index ec011c7fb..0b4f789ea 100644
--- a/.github/workflows/stale.yml.disabled
+++ b/.github/workflows/stale.yml.disabled
@@ -24,4 +24,4 @@ jobs:
           pip install PyGithub
       - name: Close stale issues
         run: |
-          python scripts/stale.py
\ No newline at end of file
+          python scripts/stale.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d568a849f..039139b95 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,3 +6,14 @@ repos:
         args:
           - --fix
   #   - id: ruff-format # TODO: enable when the time is right
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: fix-byte-order-marker
+      - id: trailing-whitespace
+      - id: mixed-line-ending
+        args:
+          - --fix=lf
diff --git a/.style.yapf b/.style.yapf
index a185235cf..e60ac16e5 100644
--- a/.style.yapf
+++ b/.style.yapf
@@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True
 SPLIT_BEFORE_FIRST_ARGUMENT = True
 SPLIT_BEFORE_LOGICAL_OPERATOR = True
 SPLIT_BEFORE_NAMED_ASSIGNS = True
-SPLIT_COMPLEX_COMPREHENSION = True
\ No newline at end of file
+SPLIT_COMPLEX_COMPREHENSION = True
diff --git a/README.md b/README.md
index a4586d6ca..61dede8c1 100644
--- a/README.md
+++ b/README.md
@@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md
index bb33b5bbd..b73569030 100644
--- a/benchmarking/switchback/README.md
+++ b/benchmarking/switchback/README.md
@@ -1,4 +1,4 @@
 Steps:
 
 1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling).
-2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
\ No newline at end of file
+2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py
index 3ef87d6b2..177270346 100644
--- a/benchmarking/switchback/make_plot_with_jsonl.py
+++ b/benchmarking/switchback/make_plot_with_jsonl.py
@@ -33,7 +33,7 @@
 
             ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'),
             ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'),
-            
+
             ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
             ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
             ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
@@ -55,7 +55,7 @@
                 y_ += df_[k_].values[0]
 
             ys.append(y_ * 0.5)
-            
+
         ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.)
@@ -67,7 +67,7 @@
         ax.set_xscale('log')
         if logscale_plot1:
             ax.set_yscale('log')
-            
+
         ax.tick_params(axis='x', labelsize=11)
         ax.tick_params(axis='y', labelsize=11)
@@ -91,7 +91,7 @@
             ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'),
             ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'),
         ]:
-            
+
             xs, ys = [], []
             df = rdf[rdf.batch_size == batch_size]
             for embed_dim in dims_to_consider:
@@ -133,4 +133,3 @@
 
 
 plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight')
-
diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py
index d70df0386..c4f3cd4c6 100644
--- a/benchmarking/switchback/speed_benchmark.py
+++ b/benchmarking/switchback/speed_benchmark.py
@@ -42,7 +42,7 @@ def get_time(k, fn, info_dict):
     for dim in [1024, 1280, 1408, 1664, 2048, 4096]:
         # note "batch_size" is actually "batch_size * embed_dim", which is why it's large
         for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]:
-            
+
             # switch switches dim_in and dim_out
             for switch in [False, True]:
@@ -62,7 +62,7 @@ def get_time(k, fn, info_dict):
                 x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda()
                 g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda()
                 w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda()
-                
+
                 x_int8 = x.clone().to(torch.int8)
                 g_int8 = g.clone().to(torch.int8)
                 w_int8 = w.clone().to(torch.int8)
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index a34385b1f..0db9df343 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
             if path.exists():
                 existent_directories.add(path)
         except PermissionError:
-            # Handle the PermissionError first as it is a subtype of OSError 
+            # Handle the PermissionError first as it is a subtype of OSError
             # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
             pass
         except OSError as exc:
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index 9ea5812ea..17383eed5 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State):
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32,
             args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True)
-
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index e515bfeff..7d869e39a 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -83,7 +83,7 @@ def backward(ctx, grad_output):
         # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])
         # not supported by PyTorch. TODO: create work-around
 
-        if req_gradA: 
+        if req_gradA:
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
@@ -167,7 +167,7 @@ def backward(ctx, grad_output):
         # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])
         # not supported by PyTorch. TODO: create work-around
 
-        if req_gradA: 
+        if req_gradA:
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py
index daa59da9c..3d7529852 100644
--- a/bitsandbytes/triton/dequantize_rowwise.py
+++ b/bitsandbytes/triton/dequantize_rowwise.py
@@ -50,7 +50,7 @@ def _dequantize_rowwise(
         max_val = tl.load(state_x + pid)
         output = max_val * x * inv_127
         tl.store(output_ptr + offsets, output, mask=row_mask)
-    
+
 
    def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor):
        output = torch.empty(*x.shape, device=x.device, dtype=torch.float16)
diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
index 1b80ab1a0..dc3047d7e 100644
--- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
@@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N,
             acc += tl.dot(a, b)
             A += BLOCK_K * SPLIT_K * stride_ak
             B += BLOCK_K * SPLIT_K * stride_bk
-        
+
         acc = (w_factor * (x_factor * (acc * divfactor)))
         acc = acc.to(C.dtype.element_ty)
diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
index 1f28b0d10..4881e1468 100644
--- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
@@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M,
             acc += tl.dot(a, b)
             A += BLOCK_K * SPLIT_K * stride_ak
             B += BLOCK_K * SPLIT_K * stride_bk
-        
+
         acc = (w_factor * (x_factor * (acc * divfactor)))
         acc = acc.to(C.dtype.element_ty)
diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py
index fcadaba3e..e7961cf53 100644
--- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py
+++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py
@@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose(
         max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0)
         output = tl.libdevice.llrint(127. * (x / max_val))
 
-        new_start = pid * M 
+        new_start = pid * M
         new_offsets = new_start + p2_arange
         tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask)
         tl.store(output_maxs + pid, max_val)
@@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor):
         grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
         _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2)
         return output, output_maxs
-
diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py
index a73a5bbaa..5cf194744 100644
--- a/bitsandbytes/triton/quantize_global.py
+++ b/bitsandbytes/triton/quantize_global.py
@@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor):
         key=['M', 'N']
     )
     @triton.jit
-    def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, 
-                       BLOCK_M : tl.constexpr, 
-                       BLOCK_N : tl.constexpr, 
+    def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N,
+                       BLOCK_M : tl.constexpr,
+                       BLOCK_N : tl.constexpr,
                        GROUP_M : tl.constexpr):
         pid = tl.program_id(0)
         grid_m = (M + BLOCK_M - 1) // BLOCK_M
         grid_n = (N + BLOCK_N - 1) // BLOCK_N
-        
+
         width = GROUP_M * grid_n
         group_id = pid // width
         group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
         pid_m = group_id * GROUP_M + (pid % group_size)
         pid_n = (pid % width) // group_size
-        
+
         rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
         rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
         A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an)
         mask = (rm < M)[:, None] & (rn < N)[None, :]
         a = tl.load(A, mask=mask)
         absmax_inv = tl.load(absmax_inv_ptr)
-        
+
         # rematerialize to save registers
         rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
         rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
@@ -95,12 +95,11 @@ def quantize_global_transpose(input):
         absmax_inv = 1./ absmax
         M, N = input.shape
         out = torch.empty(N, M, device='cuda', dtype=torch.int8)
-        
+
         assert out.size(0) == N and out.size(1) == M
         assert input.stride(0) == 1 or input.stride(1) == 1
         assert out.stride(0) == 1 or out.stride(1) == 1
-        
+
         grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),)
         _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N)
         return out, absmax
-
diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py
index fce464b19..078f4aa2d 100644
--- a/bitsandbytes/triton/quantize_rowwise.py
+++ b/bitsandbytes/triton/quantize_rowwise.py
@@ -46,7 +46,7 @@ def _quantize_rowwise(
         offsets = block_start + arange
         row_mask = arange < BLOCK_SIZE
         x = tl.load(x_ptr + offsets, mask=row_mask)
-        
+
         abs_x = tl.abs(x)
         max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0)
         output = tl.libdevice.llrint(127. * (x / max_val))
@@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor):
         grid = lambda meta: (x.shape[0],)
         _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2)
         return output, output_maxs
-
diff --git a/compile_from_source.md b/compile_from_source.md
index 23afe1591..6310fd6c6 100644
--- a/compile_from_source.md
+++ b/compile_from_source.md
@@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps:
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
@@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source,
 
 ## Compilation with Kepler
 Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler`
-
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 0fff83665..f117547ed 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
           return 1.00000000f*absmax*sign; // 1011
         else
           return 0.66666667f*absmax*sign; // 1010
-      else 
+      else
         if((val & 0b0001) == 1) // 100
           return 5.208333333e-03f*absmax*sign; // 1001
         else
@@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
   else
     if((val & 0b0100) == 4) // 0
       if((val & 0b0010) == 2) //01
         if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
         else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
       else
        if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
         else
           return -0.28444138169288635f;
     else
@@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val)
         if((val & 0b0001) == 1) // 001
           return -0.39491748809814453f;
         else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
@@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
   else
     if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) //01
        if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
        else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
@@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
@@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
                 //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
                 //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
                 g_val *= gnorm_scale;
-                
+
                 s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
 
                 s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
@@ -3069,7 +3069,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char *
 //// use k warps per thread block
 //// 1. threadblock use read-only cache to read in register tile for A into shared memory
 //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
-//// 3. each warp reads a segment of values 16x32 from B 
+//// 3. each warp reads a segment of values 16x32 from B
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
@@ -3531,7 +3531,7 @@ template __global__ void kgemm_4bit_inference(int M, i
 template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize)
 {
-  // per threadblock: 
+  // per threadblock:
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter
   // 1x32 * 32x4 -> 1x4 outputs per thread block
@@ -3764,7 +3764,7 @@ template __global__ void kfunc(T *A, T *B, T value, long
   {
     switch(FUNC)
     {
-      case FILL: 
+      case FILL:
         A[i] = (T)value;
         break;
       case ARANGE:
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 865e4b6d5..087ae3921 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -389,7 +389,7 @@ extern "C"
     int hasPrefetch = 0;
     CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead
     if (hasPrefetch == 0) return;
-    
+
     CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
   }
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 28da69eb0..043597177 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -1,8 +1,8 @@
-- sections: 
+- sections:
   - local: index
     title: Bits & Bytes
   - local: quickstart
     title: Quickstart
   - local: installation
     title: Installation
-  title: Get started
\ No newline at end of file
+  title: Get started
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 68ad433e6..67c928309 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
@@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow
   journal={9th International Conference on Learning Representations, ICLR},
   year={2022}
 }
-```
\ No newline at end of file
+```
diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index 4dff2ba46..d1028c655 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -6,7 +6,7 @@
 
 ## Minimal example
 
-The following code illustrates the steps above. 
+The following code illustrates the steps above.
 
 ```python
-```
\ No newline at end of file
+```
diff --git a/environment.yml b/environment.yml
index c0e07f153..9ab48dedc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -42,4 +42,4 @@ dependencies:
 ## ENV UPDATE:
 #
 # add new packages to environment.yml, then:
-# mamba env update -n bnb -f environment.yml
\ No newline at end of file
+# mamba env update -n bnb -f environment.yml
diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index dc80a44db..2cee48e8e 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -22,6 +22,3 @@
 )
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-
-
-
diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md
index b5f01fbe5..566b0170e 100644
--- a/how_to_use_nonpytorch_cuda.md
+++ b/how_to_use_nonpytorch_cuda.md
@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
diff --git a/install_cuda.py b/install_cuda.py
index 77e258609..4b041b8d0 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
     # Install CUDA
     print(f"Installing CUDA version {version}...")
     install_command = [
-        "bash", filepath, 
-        "--no-drm", "--no-man-page", "--override", 
+        "bash", filepath,
+        "--no-drm", "--no-man-page", "--override",
         "--toolkitpath=" + install_path,
         "--toolkit", "--silent"
     ]
     print(f"Running command: {' '.join(install_command)}")
-    
+
     try:
         subprocess.run(install_command, check=True)
     except subprocess.CalledProcessError as e:
@@ -99,4 +99,4 @@ def main():
         sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/scripts/stale.py b/scripts/stale.py
index c299643ae..613f5b7cb 100644
--- a/scripts/stale.py
+++ b/scripts/stale.py
@@ -55,4 +55,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 7e70a30ca..d01e5e9db 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -519,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
             torch.testing.assert_close(
                 gradB1, gradB2, atol=0.18, rtol=0.3
             )
-
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 5e1a548e5..189aa75b5 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -19,11 +19,3 @@ def test_manual_override(requires_cuda):
     import bitsandbytes as bnb
     loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name
     #assert loaded_lib == 'libbitsandbytes_cuda122.so'
-
-
-
-
-
-
-
-
diff --git a/tests/test_functional.py b/tests/test_functional.py
index f4b8fca51..2d4e959ad 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2345,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     torch.testing.assert_close(A, C2)
     #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001)
     #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)
-
-
diff --git a/tests/test_generation.py b/tests/test_generation.py
index 9ed30cd2a..b05749bf8 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -120,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype):
         for out in outputs:
             print(out)
         raise ValueError(f'Failure count: {failure_count}/{n_cases}')
-
-
-
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 1cb04044f..32d90938d 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -637,6 +637,3 @@ def test_4bit_warnings():
         net(inp)
 
     assert len(record) == 2
-
-
-
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 943db067a..218a533d5 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -58,4 +58,3 @@ def test_switchback(vector_wise_quantization):
         print('GX1', err_sb, err_baseline)
 
         assert err_sb < 2 * err_baseline
-
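The hunks above are almost entirely trailing-whitespace and end-of-file-newline fixes, which the hooks added to `.pre-commit-config.yaml` now enforce automatically. As a usage note rather than part of the patch, here is a minimal sketch of how a contributor might run those hooks locally, assuming `pre-commit` is installed from PyPI:

```bash
# One-time setup: install the hook runner and register it with git (assumption:
# pre-commit is not yet installed in the contributor's environment).
pip install pre-commit
pre-commit install

# Run every configured hook (ruff, check-yaml, end-of-file-fixer,
# trailing-whitespace, mixed-line-ending, ...) across the whole tree;
# this reproduces the kind of cleanup shown in the patch above.
pre-commit run --all-files
```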