Enable line-ending and other hygiene lints (#1006)
akx authored Feb 1, 2024
1 parent 3a630c5 commit 6974920
Showing 36 changed files with 97 additions and 113 deletions.
8 changes: 4 additions & 4 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -18,15 +18,15 @@ body:
label: Reproduction
description: |
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
-Please provide the simplest reproducer as possible so that we can quickly fix the issue.
+Please provide the simplest reproducer as possible so that we can quickly fix the issue.
placeholder: |
-Reproducer:
+Reproducer:
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "A clear and concise description of what you would expect to happen."
description: "A clear and concise description of what you would expect to happen."
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -18,7 +18,7 @@ body:
attributes:
label: Motivation
description: |
-Please outline the motivation for the proposal. Is your feature request related to a problem?
+Please outline the motivation for the proposal. Is your feature request related to a problem?
- type: textarea
id: contribution
@@ -27,4 +27,4 @@ body:
attributes:
label: Your contribution
description: |
-Is there any way that you could help, e.g. by submitting a PR?
+Is there any way that you could help, e.g. by submitting a PR?
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: bitsandbytes
-repo_owner: TimDettmers
+repo_owner: TimDettmers
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml.disabled
@@ -24,4 +24,4 @@ jobs:
pip install PyGithub
- name: Close stale issues
run: |
-python scripts/stale.py
+python scripts/stale.py
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -6,3 +6,14 @@ repos:
args:
- --fix
# - id: ruff-format # TODO: enable when the time is right
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.5.0
+  hooks:
+  - id: check-merge-conflict
+  - id: check-yaml
+  - id: end-of-file-fixer
+  - id: fix-byte-order-marker
+  - id: trailing-whitespace
+  - id: mixed-line-ending
+    args:
+    - --fix=lf
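
For context, the newly added hooks enforce LF line endings, a single trailing newline, no byte-order marker, and no trailing whitespace. A rough Python sketch of the same normalization follows — illustrative only, not the real hook implementations from pre-commit/pre-commit-hooks:

```python
# A minimal sketch of what the new hygiene hooks enforce, assuming a
# UTF-8 text file (not the actual pre-commit-hooks code).
from pathlib import Path

def normalize(path: str) -> None:
    data = Path(path).read_bytes()
    if data.startswith(b"\xef\xbb\xbf"):                       # fix-byte-order-marker
        data = data[3:]
    data = data.replace(b"\r\n", b"\n").replace(b"\r", b"\n")  # mixed-line-ending --fix=lf
    lines = [line.rstrip() for line in data.split(b"\n")]      # trailing-whitespace
    data = b"\n".join(lines)
    if data:                                                   # end-of-file-fixer: exactly one final newline
        data = data.rstrip(b"\n") + b"\n"
    Path(path).write_bytes(data)
```

Running `pre-commit run --all-files` once after adding these hooks is presumably what produced the whitespace-only changes throughout the rest of this diff.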
2 changes: 1 addition & 1 deletion .style.yapf
@@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True
SPLIT_BEFORE_FIRST_ARGUMENT = True
SPLIT_BEFORE_LOGICAL_OPERATOR = True
SPLIT_BEFORE_NAMED_ASSIGNS = True
-SPLIT_COMPLEX_COMPREHENSION = True
+SPLIT_COMPLEX_COMPREHENSION = True
4 changes: 2 additions & 2 deletions README.md
@@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1
+bash install_cuda.sh 117 ~/local 1
```

To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
2 changes: 1 addition & 1 deletion benchmarking/switchback/README.md
@@ -1,4 +1,4 @@
Steps:

1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling).
-2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
+2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
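
Step 1 writes one JSON object per line; a hedged sketch of loading that file for the plotting in step 2 (the use of pandas is an assumption about the script's internals, though the `rdf` frame and `batch_size` column do appear in the diff below):

```python
# Load the timing records written by speed_benchmark.py: one JSON
# object per jsonl line (file name comes from step 1 above).
import pandas as pd

rdf = pd.read_json('speed_benchmark/info_a100_py2.jsonl', lines=True)
print(rdf.head())
```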
9 changes: 4 additions & 5 deletions benchmarking/switchback/make_plot_with_jsonl.py
@@ -33,7 +33,7 @@

('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'),
('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'),

('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
@@ -55,7 +55,7 @@
y_ += df_[k_].values[0]
ys.append(y_ * 0.5)


ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.)


@@ -67,7 +67,7 @@
ax.set_xscale('log')
if logscale_plot1:
ax.set_yscale('log')

ax.tick_params(axis='x', labelsize=11)
ax.tick_params(axis='y', labelsize=11)

@@ -91,7 +91,7 @@
('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'),
('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'),
]:

xs, ys = [], []
df = rdf[rdf.batch_size == batch_size]
for embed_dim in dims_to_consider:
@@ -133,4 +133,3 @@


plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight')
-
4 changes: 2 additions & 2 deletions benchmarking/switchback/speed_benchmark.py
@@ -42,7 +42,7 @@ def get_time(k, fn, info_dict):
for dim in [1024, 1280, 1408, 1664, 2048, 4096]:
# note "batch_size" is actually "batch_size * embed_dim", which is why it's large
for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]:

# switch switches dim_in and dim_out
for switch in [False, True]:

@@ -62,7 +62,7 @@ def get_time(k, fn, info_dict):
x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda()
g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda()
w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda()

x_int8 = x.clone().to(torch.int8)
g_int8 = g.clone().to(torch.int8)
w_int8 = w.clone().to(torch.int8)
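
A plausible shape for the `get_time(k, fn, info_dict)` helper the hunk headers reference — a sketch only, since its body is not shown here; explicit synchronization before and after is the standard way to wall-clock a CUDA op:

```python
# Time a CUDA operation and record the duration under key k
# (a sketch of the helper, not the file's exact implementation).
import time

import torch

def get_time(k, fn, info_dict):
    torch.cuda.synchronize()
    start = time.time()
    fn()
    torch.cuda.synchronize()
    info_dict[k] = time.time() - start
```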
2 changes: 1 addition & 1 deletion bitsandbytes/cuda_setup/main.py
@@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
if path.exists():
existent_directories.add(path)
except PermissionError:
-# Handle the PermissionError first as it is a subtype of OSError
+# Handle the PermissionError first as it is a subtype of OSError
# https://docs.python.org/3/library/exceptions.html#exception-hierarchy
pass
except OSError as exc:
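
The comment in this hunk is worth unpacking: `except` clauses are matched top to bottom, so the subclass must be listed before its parent or the broader clause swallows it. A toy illustration, loosely modeled on the function above (not code from the repo):

```python
# PermissionError is a subclass of OSError; listing OSError first
# would shadow the specific handler below.
from pathlib import Path

def existing_dirs(candidates):
    existent_directories = set()
    for path in map(Path, candidates):
        try:
            if path.exists():
                existent_directories.add(path)
        except PermissionError:
            pass  # unreadable candidate directory: skip it silently
        except OSError as exc:
            print(f"could not inspect {path}: {exc}")
    return existent_directories
```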
1 change: 0 additions & 1 deletion bitsandbytes/optim/adamw.py
@@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State):
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32,
args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True):
super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True)
-
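
A possible usage of the paged optimizer defined above (a sketch; any CUDA `nn.Module` would do):

```python
# Illustrative only: with is_paged=True the optimizer state lives in
# paged memory that can be evicted to CPU under GPU memory pressure,
# while construction mirrors torch.optim.AdamW.
import torch

import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = bnb.optim.PagedAdamW32bit(model.parameters(), lr=1e-3, weight_decay=1e-2)
```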
4 changes: 2 additions & 2 deletions bitsandbytes/research/autograd/_functions.py
@@ -83,7 +83,7 @@ def backward(ctx, grad_output):
# fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])

# not supported by PyTorch. TODO: create work-around
-if req_gradA:
+if req_gradA:
grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)

if req_gradB:
@@ -167,7 +167,7 @@ def backward(ctx, grad_output):
# fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])

# not supported by PyTorch. TODO: create work-around
if req_gradA:
if req_gradA:
grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)

if req_gradB:
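
For reference, the gradients these hunks compute follow the standard matmul rule; a toy recap in plain tensors, ignoring the fp8 casts (`fp8out` above stands in for the quantized `grad_output`):

```python
# For Y = A @ B: dA = dY @ B^T and dB = A^T @ dY.
import torch

A = torch.randn(4, 8)
B = torch.randn(8, 16)
grad_output = torch.randn(4, 16)           # dY

grad_A = torch.matmul(grad_output, B.t())  # shape (4, 8), like A
grad_B = torch.matmul(A.t(), grad_output)  # shape (8, 16), like B
```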
2 changes: 1 addition & 1 deletion bitsandbytes/triton/dequantize_rowwise.py
@@ -50,7 +50,7 @@ def _dequantize_rowwise(
max_val = tl.load(state_x + pid)
output = max_val * x * inv_127
tl.store(output_ptr + offsets, output, mask=row_mask)
-
+
def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor):
output = torch.empty(*x.shape, device=x.device, dtype=torch.float16)
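
A rough PyTorch counterpart of the Triton kernel above — a sketch for orientation, not the shipped API:

```python
# Rescale each int8 row by its stored absolute maximum over 127,
# matching the kernel's output = max_val * x * inv_127.
import torch

def dequantize_rowwise_ref(x_int8: torch.Tensor, state_x: torch.Tensor) -> torch.Tensor:
    return (state_x[:, None] * x_int8.float() * (1.0 / 127.0)).to(torch.float16)
```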
2 changes: 1 addition & 1 deletion bitsandbytes/triton/int8_matmul_mixed_dequantize.py
@@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N,
acc += tl.dot(a, b)
A += BLOCK_K * SPLIT_K * stride_ak
B += BLOCK_K * SPLIT_K * stride_bk

acc = (w_factor * (x_factor * (acc * divfactor)))
acc = acc.to(C.dtype.element_ty)

2 changes: 1 addition & 1 deletion bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
@@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M,
acc += tl.dot(a, b)
A += BLOCK_K * SPLIT_K * stride_ak
B += BLOCK_K * SPLIT_K * stride_bk

acc = (w_factor * (x_factor * (acc * divfactor)))
acc = acc.to(C.dtype.element_ty)

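
The `acc = (w_factor * (x_factor * (acc * divfactor)))` epilogue shared by both kernels undoes the two 127-scalings at once; schematically, with illustrative names:

```python
# The int32 accumulator of an int8 x int8 matmul is rescaled by the
# per-row X maxima and per-column W maxima; divfactor folds both
# /127 steps into one constant (a sketch of the epilogue, not the kernel).
import torch

def dequant_epilogue(acc_int32: torch.Tensor, state_x: torch.Tensor, state_w: torch.Tensor) -> torch.Tensor:
    divfactor = 1.0 / (127.0 * 127.0)
    return (state_w[None, :] * (state_x[:, None] * (acc_int32.float() * divfactor))).to(torch.float16)
```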
3 changes: 1 addition & 2 deletions bitsandbytes/triton/quantize_columnwise_and_transpose.py
@@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose(
max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0)
output = tl.libdevice.llrint(127. * (x / max_val))

-new_start = pid * M
+new_start = pid * M
new_offsets = new_start + p2_arange
tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask)
tl.store(output_maxs + pid, max_val)
@@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor):
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2)
return output, output_maxs
-
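
A sketchy PyTorch analogue of the fused kernel above, for orientation only:

```python
# Quantize each column by its absmax, then return the int8 result
# transposed (M x N -> N x M), roughly mirroring the fused kernel.
import torch

def quantize_columnwise_and_transpose_ref(x: torch.Tensor):
    output_maxs = x.abs().amax(dim=0)                 # one max per column
    q = torch.round(127.0 * (x / output_maxs[None, :])).to(torch.int8)
    return q.t().contiguous(), output_maxs
```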
17 changes: 8 additions & 9 deletions bitsandbytes/triton/quantize_global.py
@@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor):
key=['M', 'N']
)
@triton.jit
-def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N,
-BLOCK_M : tl.constexpr,
-BLOCK_N : tl.constexpr,
+def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N,
+BLOCK_M : tl.constexpr,
+BLOCK_N : tl.constexpr,
GROUP_M : tl.constexpr):
pid = tl.program_id(0)
grid_m = (M + BLOCK_M - 1) // BLOCK_M
grid_n = (N + BLOCK_N - 1) // BLOCK_N

width = GROUP_M * grid_n
group_id = pid // width
group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
pid_m = group_id * GROUP_M + (pid % group_size)
pid_n = (pid % width) // group_size

rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an)
mask = (rm < M)[:, None] & (rn < N)[None, :]
a = tl.load(A, mask=mask)
absmax_inv = tl.load(absmax_inv_ptr)

# rematerialize to save registers
rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
@@ -95,12 +95,11 @@ def quantize_global_transpose(input):
absmax_inv = 1./ absmax
M, N = input.shape
out = torch.empty(N, M, device='cuda', dtype=torch.int8)

assert out.size(0) == N and out.size(1) == M
assert input.stride(0) == 1 or input.stride(1) == 1
assert out.stride(0) == 1 or out.stride(1) == 1

grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),)
_quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N)
return out, absmax
-
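
An illustrative counterpart of `quantize_global`, assuming nothing beyond what the hunk shows: a single absmax scale for the whole tensor rather than one per row or column.

```python
# One global scale: absmax over the entire tensor (a sketch, not the
# library API; the Triton version also fuses the transpose variant).
import torch

def quantize_global_ref(x: torch.Tensor):
    absmax = x.abs().max()
    return torch.round(127.0 * (x / absmax)).to(torch.int8), absmax
```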
3 changes: 1 addition & 2 deletions bitsandbytes/triton/quantize_rowwise.py
@@ -46,7 +46,7 @@ def _quantize_rowwise(
offsets = block_start + arange
row_mask = arange < BLOCK_SIZE
x = tl.load(x_ptr + offsets, mask=row_mask)

abs_x = tl.abs(x)
max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0)
output = tl.libdevice.llrint(127. * (x / max_val))
@@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor):
grid = lambda meta: (x.shape[0],)
_quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2)
return output, output_maxs
-
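
A rough PyTorch equivalent of `_quantize_rowwise` — a sketch, not the shipped API (the kernel uses `llrint`, approximated here by `torch.round`):

```python
# Per-row absmax scaling with round-to-nearest into int8, matching
# the kernel's output = llrint(127 * (x / max_val)).
import torch

def quantize_rowwise_ref(x: torch.Tensor):
    output_maxs = x.abs().amax(dim=1, keepdim=True)
    output = torch.round(127.0 * (x / output_maxs)).to(torch.int8)
    return output, output_maxs.squeeze(1)
```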
5 changes: 2 additions & 3 deletions compile_from_source.md
@@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps:
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1
+bash install_cuda.sh 117 ~/local 1
```

By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
@@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source,
## Compilation with Kepler

Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler`
-