diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml
index ac8e9de00..6ae3c7c0a 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@@ -18,15 +18,15 @@ body:
       label: Reproduction
       description: |
         Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
-        Please provide the simplest reproducer as possible so that we can quickly fix the issue. 
+        Please provide the simplest reproducer as possible so that we can quickly fix the issue.
       placeholder: |
-        Reproducer:
-        
+        Reproducer:
+
 
   - type: textarea
     id: expected-behavior
     validations:
       required: true
     attributes:
       label: Expected behavior
-      description: "A clear and concise description of what you would expect to happen."
\ No newline at end of file
+      description: "A clear and concise description of what you would expect to happen."
diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml
index 4e75c2a64..c39f346b9 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yml
@@ -18,7 +18,7 @@ body:
     attributes:
       label: Motivation
       description: |
-        Please outline the motivation for the proposal. Is your feature request related to a problem? 
+        Please outline the motivation for the proposal. Is your feature request related to a problem?
 
   - type: textarea
     id: contribution
@@ -27,4 +27,4 @@ body:
     attributes:
       label: Your contribution
       description: |
-        Is there any way that you could help, e.g. by submitting a PR?
\ No newline at end of file
+        Is there any way that you could help, e.g. by submitting a PR?
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index dace206b1..40ea8b5bc 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,4 +14,4 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
-      repo_owner: TimDettmers
\ No newline at end of file
+      repo_owner: TimDettmers
diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled
index ec011c7fb..0b4f789ea 100644
--- a/.github/workflows/stale.yml.disabled
+++ b/.github/workflows/stale.yml.disabled
@@ -24,4 +24,4 @@ jobs:
           pip install PyGithub
       - name: Close stale issues
         run: |
-          python scripts/stale.py
\ No newline at end of file
+          python scripts/stale.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d568a849f..039139b95 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,3 +6,14 @@ repos:
         args:
           - --fix
   #   - id: ruff-format # TODO: enable when the time is right
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: fix-byte-order-marker
+      - id: trailing-whitespace
+      - id: mixed-line-ending
+        args:
+          - --fix=lf
diff --git a/.style.yapf b/.style.yapf
index a185235cf..e60ac16e5 100644
--- a/.style.yapf
+++ b/.style.yapf
@@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True
 SPLIT_BEFORE_FIRST_ARGUMENT = True
 SPLIT_BEFORE_LOGICAL_OPERATOR = True
 SPLIT_BEFORE_NAMED_ASSIGNS = True
-SPLIT_COMPLEX_COMPREHENSION = True
\ No newline at end of file
+SPLIT_COMPLEX_COMPREHENSION = True
diff --git a/README.md b/README.md
index a4586d6ca..61dede8c1 100644
--- a/README.md
+++ b/README.md
@@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md
index bb33b5bbd..b73569030 100644
--- a/benchmarking/switchback/README.md
+++ b/benchmarking/switchback/README.md
@@ -1,4 +1,4 @@
 Steps:
 
 1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling).
-2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
\ No newline at end of file
+2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed.
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py
index 3ef87d6b2..177270346 100644
--- a/benchmarking/switchback/make_plot_with_jsonl.py
+++ b/benchmarking/switchback/make_plot_with_jsonl.py
@@ -33,7 +33,7 @@
 
             ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'),
             ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'),
-            
+
             ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
             ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
             ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
@@ -55,7 +55,7 @@
                 y_ += df_[k_].values[0]
 
             ys.append(y_ * 0.5)
-            
+
         ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.)
@@ -67,7 +67,7 @@
         ax.set_xscale('log')
         if logscale_plot1:
             ax.set_yscale('log')
-            
+
         ax.tick_params(axis='x', labelsize=11)
         ax.tick_params(axis='y', labelsize=11)
@@ -91,7 +91,7 @@
             ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'),
             ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'),
         ]:
-            
+
             xs, ys = [], []
             df = rdf[rdf.batch_size == batch_size]
             for embed_dim in dims_to_consider:
@@ -133,4 +133,3 @@
 
 
 plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight')
-
diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py
index d70df0386..c4f3cd4c6 100644
--- a/benchmarking/switchback/speed_benchmark.py
+++ b/benchmarking/switchback/speed_benchmark.py
@@ -42,7 +42,7 @@ def get_time(k, fn, info_dict):
     for dim in [1024, 1280, 1408, 1664, 2048, 4096]:
         # note "batch_size" is actually "batch_size * embed_dim", which is why it's large
         for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]:
-            
+
             # switch switches dim_in and dim_out
             for switch in [False, True]:
@@ -62,7 +62,7 @@ def get_time(k, fn, info_dict):
                 x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda()
                 g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda()
                 w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda()
-                
+
                 x_int8 = x.clone().to(torch.int8)
                 g_int8 = g.clone().to(torch.int8)
                 w_int8 = w.clone().to(torch.int8)
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index a34385b1f..0db9df343 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
             if path.exists():
                 existent_directories.add(path)
         except PermissionError:
-            # Handle the PermissionError first as it is a subtype of OSError 
+            # Handle the PermissionError first as it is a subtype of OSError
             # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
             pass
         except OSError as exc:
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index 9ea5812ea..17383eed5 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State):
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32,
             args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True)
-
diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py
index e515bfeff..7d869e39a 100644
--- a/bitsandbytes/research/autograd/_functions.py
+++ b/bitsandbytes/research/autograd/_functions.py
@@ -83,7 +83,7 @@ def backward(ctx, grad_output):
         # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])
         # not supported by PyTorch. TODO: create work-around
 
-        if req_gradA: 
+        if req_gradA:
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
@@ -167,7 +167,7 @@ def backward(ctx, grad_output):
         # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2])
         # not supported by PyTorch. TODO: create work-around
 
-        if req_gradA: 
+        if req_gradA:
             grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype)
 
         if req_gradB:
diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py
index daa59da9c..3d7529852 100644
--- a/bitsandbytes/triton/dequantize_rowwise.py
+++ b/bitsandbytes/triton/dequantize_rowwise.py
@@ -50,7 +50,7 @@ def _dequantize_rowwise(
         max_val = tl.load(state_x + pid)
         output = max_val * x * inv_127
         tl.store(output_ptr + offsets, output, mask=row_mask)
-    
+
 
    def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor):
        output = torch.empty(*x.shape, device=x.device, dtype=torch.float16)
diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
index 1b80ab1a0..dc3047d7e 100644
--- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
@@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N,
             acc += tl.dot(a, b)
             A += BLOCK_K * SPLIT_K * stride_ak
             B += BLOCK_K * SPLIT_K * stride_bk
-        
+
         acc = (w_factor * (x_factor * (acc * divfactor)))
         acc = acc.to(C.dtype.element_ty)
diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
index 1f28b0d10..4881e1468 100644
--- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
+++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
@@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M,
             acc += tl.dot(a, b)
             A += BLOCK_K * SPLIT_K * stride_ak
             B += BLOCK_K * SPLIT_K * stride_bk
-        
+
         acc = (w_factor * (x_factor * (acc * divfactor)))
         acc = acc.to(C.dtype.element_ty)
diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py
index fcadaba3e..e7961cf53 100644
--- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py
+++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py
@@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose(
         max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0)
         output = tl.libdevice.llrint(127. * (x / max_val))
 
-        new_start = pid * M 
+        new_start = pid * M
         new_offsets = new_start + p2_arange
         tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask)
         tl.store(output_maxs + pid, max_val)
@@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor):
         grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
         _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2)
         return output, output_maxs
-
diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py
index a73a5bbaa..5cf194744 100644
--- a/bitsandbytes/triton/quantize_global.py
+++ b/bitsandbytes/triton/quantize_global.py
@@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor):
         key=['M', 'N']
     )
     @triton.jit
-    def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, 
-                       BLOCK_M : tl.constexpr, 
-                       BLOCK_N : tl.constexpr, 
+    def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N,
+                       BLOCK_M : tl.constexpr,
+                       BLOCK_N : tl.constexpr,
                        GROUP_M : tl.constexpr):
         pid = tl.program_id(0)
         grid_m = (M + BLOCK_M - 1) // BLOCK_M
         grid_n = (N + BLOCK_N - 1) // BLOCK_N
-        
+
         width = GROUP_M * grid_n
         group_id = pid // width
         group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
         pid_m = group_id * GROUP_M + (pid % group_size)
         pid_n = (pid % width) // group_size
-        
+
         rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
         rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
         A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an)
         mask = (rm < M)[:, None] & (rn < N)[None, :]
         a = tl.load(A, mask=mask)
         absmax_inv = tl.load(absmax_inv_ptr)
-        
+
         # rematerialize to save registers
         rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
         rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
@@ -95,12 +95,11 @@ def quantize_global_transpose(input):
         absmax_inv = 1./ absmax
         M, N = input.shape
         out = torch.empty(N, M, device='cuda', dtype=torch.int8)
-        
+
         assert out.size(0) == N and out.size(1) == M
         assert input.stride(0) == 1 or input.stride(1) == 1
         assert out.stride(0) == 1 or out.stride(1) == 1
-        
+
         grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),)
         _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N)
         return out, absmax
-
diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py
index fce464b19..078f4aa2d 100644
--- a/bitsandbytes/triton/quantize_rowwise.py
+++ b/bitsandbytes/triton/quantize_rowwise.py
@@ -46,7 +46,7 @@ def _quantize_rowwise(
         offsets = block_start + arange
         row_mask = arange < BLOCK_SIZE
         x = tl.load(x_ptr + offsets, mask=row_mask)
-        
+
         abs_x = tl.abs(x)
         max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0)
         output = tl.libdevice.llrint(127. * (x / max_val))
@@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor):
         grid = lambda meta: (x.shape[0],)
         _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2)
         return output, output_maxs
-
diff --git a/compile_from_source.md b/compile_from_source.md
index 23afe1591..6310fd6c6 100644
--- a/compile_from_source.md
+++ b/compile_from_source.md
@@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps:
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler.
@@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source,
 
 ## Compilation with Kepler
 Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler`
-
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 0fff83665..f117547ed 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax)
           return 1.00000000f*absmax*sign; // 1011
         else
           return 0.66666667f*absmax*sign; // 1010
-      else 
+      else
         if((val & 0b0001) == 1) // 100
           return 5.208333333e-03f*absmax*sign; // 1001
         else
@@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
   else
     if((val & 0b0100) == 4) // 0
       if((val & 0b0010) == 2) //01
         if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
         else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
       else
        if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
         else
           return -0.28444138169288635f;
     else
@@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val)
         if((val & 0b0001) == 1) // 001
           return -0.39491748809814453f;
         else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
@@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val)
     if((val & 0b0100) == 4) // 1
       if((val & 0b0010) == 2) // 11
         if((val & 0b0001) == 1) // 111
-          return 1.0f; 
+          return 1.0f;
         else
           return 0.7229568362236023f;
       else
         if((val & 0b0001) == 1) // 110
-          return 0.5626170039176941f; 
+          return 0.5626170039176941f;
         else
-          return 0.44070982933044434f; 
+          return 0.44070982933044434f;
     else
       if((val & 0b0010) == 2) //10
         if((val & 0b0001) == 1) // 101
-          return 0.33791524171829224f; 
+          return 0.33791524171829224f;
         else
-          return 0.24611230194568634f; 
-      else 
+          return 0.24611230194568634f;
+      else
         if((val & 0b0001) == 1) // 100
-          return 0.16093020141124725f; 
+          return 0.16093020141124725f;
         else
-          return 0.07958029955625534f; 
+          return 0.07958029955625534f;
   else
     if((val & 0b0100) == 4) // 0
      if((val & 0b0010) == 2) //01
        if((val & 0b0001) == 1) // 011
-          return 0.0f; 
+          return 0.0f;
        else
-          return -0.09105003625154495f; 
+          return -0.09105003625154495f;
      else
        if((val & 0b0001) == 1) // 010
-          return -0.18477343022823334f; 
+          return -0.18477343022823334f;
        else
          return -0.28444138169288635f;
    else
@@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val)
        if((val & 0b0001) == 1) // 001
          return -0.39491748809814453f;
        else
-          return -0.5250730514526367f; 
-      else 
+          return -0.5250730514526367f;
+      else
         if((val & 0b0001) == 1) // 000
-          return -0.6961928009986877f; 
+          return -0.6961928009986877f;
         else
-          return -1.0f; 
+          return -1.0f;
 
 }
@@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
                 //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps);
                 //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val;
                 g_val *= gnorm_scale;
-                
+
                 s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val));
 
                 s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
@@ -3069,7 +3069,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char *
 //// use k warps per thread block
 //// 1. threadblock use read-only cache to read in register tile for A into shared memory
 //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments
-//// 3. each warp reads a segment of values 16x32 from B 
+//// 3. each warp reads a segment of values 16x32 from B
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
@@ -3531,7 +3531,7 @@ template __global__ void kgemm_4bit_inference(int M, i
 template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize)
 {
-  // per threadblock: 
+  // per threadblock:
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter
   // 1x32 * 32x4 -> 1x4 outputs per thread block
@@ -3764,7 +3764,7 @@ template __global__ void kfunc(T *A, T *B, T value, long
   {
     switch(FUNC)
     {
-      case FILL: 
+      case FILL:
         A[i] = (T)value;
         break;
       case ARANGE:
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 865e4b6d5..087ae3921 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -389,7 +389,7 @@ extern "C"
     int hasPrefetch = 0;
     CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead
     if (hasPrefetch == 0) return;
-    
+
     CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0));
     CUDA_CHECK_RETURN(cudaPeekAtLastError());
   }
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 28da69eb0..043597177 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -1,8 +1,8 @@
-- sections: 
+- sections:
   - local: index
     title: Bits & Bytes
   - local: quickstart
     title: Quickstart
   - local: installation
     title: Installation
-  title: Get started
\ No newline at end of file
+  title: Get started
diff --git a/docs/source/index.mdx b/docs/source/index.mdx
index 68ad433e6..67c928309 100644
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash install_cuda.sh 117 ~/local 1 
+bash install_cuda.sh 117 ~/local 1
 ```
 
 To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`:
@@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow
   journal={9th International Conference on Learning Representations, ICLR},
   year={2022}
 }
-```
\ No newline at end of file
+```
diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx
index 4dff2ba46..d1028c655 100644
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@@ -6,7 +6,7 @@
 
 ## Minimal example
 
-The following code illustrates the steps above. 
+The following code illustrates the steps above.
 
 ```python
-```
\ No newline at end of file
+```
diff --git a/environment.yml b/environment.yml
index c0e07f153..9ab48dedc 100644
--- a/environment.yml
+++ b/environment.yml
@@ -42,4 +42,4 @@ dependencies:
 ## ENV UPDATE:
 #
 # add new packages to environment.yml, then:
-# mamba env update -n bnb -f environment.yml
\ No newline at end of file
+# mamba env update -n bnb -f environment.yml
diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index dc80a44db..2cee48e8e 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -22,6 +22,3 @@
 )
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
-
-
-
diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md
index b5f01fbe5..566b0170e 100644
--- a/how_to_use_nonpytorch_cuda.md
+++ b/how_to_use_nonpytorch_cuda.md
@@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
 # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
-# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True 
+# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
diff --git a/install_cuda.py b/install_cuda.py
index 77e258609..4b041b8d0 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path):
     # Install CUDA
     print(f"Installing CUDA version {version}...")
     install_command = [
-        "bash", filepath, 
-        "--no-drm", "--no-man-page", "--override", 
+        "bash", filepath,
+        "--no-drm", "--no-man-page", "--override",
         "--toolkitpath=" + install_path,
         "--toolkit", "--silent"
     ]
     print(f"Running command: {' '.join(install_command)}")
-    
+
     try:
         subprocess.run(install_command, check=True)
     except subprocess.CalledProcessError as e:
@@ -99,4 +99,4 @@ def main():
         sys.exit(1)
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/scripts/stale.py b/scripts/stale.py
index c299643ae..613f5b7cb 100644
--- a/scripts/stale.py
+++ b/scripts/stale.py
@@ -55,4 +55,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 7e70a30ca..d01e5e9db 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -519,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
             torch.testing.assert_close(
                 gradB1, gradB2, atol=0.18, rtol=0.3
             )
-
diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py
index 5e1a548e5..189aa75b5 100644
--- a/tests/test_cuda_setup_evaluator.py
+++ b/tests/test_cuda_setup_evaluator.py
@@ -19,11 +19,3 @@ def test_manual_override(requires_cuda):
     import bitsandbytes as bnb
     loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name
     #assert loaded_lib == 'libbitsandbytes_cuda122.so'
-
-
-
-
-
-
-
-
diff --git a/tests/test_functional.py b/tests/test_functional.py
index f4b8fca51..2d4e959ad 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -2345,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant):
     torch.testing.assert_close(A, C2)
     #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001)
     #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080)
-
-
diff --git a/tests/test_generation.py b/tests/test_generation.py
index 9ed30cd2a..b05749bf8 100644
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
@@ -120,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype):
         for out in outputs:
             print(out)
         raise ValueError(f'Failure count: {failure_count}/{n_cases}')
-
-
-
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 1cb04044f..32d90938d 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -637,6 +637,3 @@ def test_4bit_warnings():
         net(inp)
 
     assert len(record) == 2
-
-
-
diff --git a/tests/test_triton.py b/tests/test_triton.py
index 943db067a..218a533d5 100644
--- a/tests/test_triton.py
+++ b/tests/test_triton.py
@@ -58,4 +58,3 @@ def test_switchback(vector_wise_quantization):
         print('GX1', err_sb, err_baseline)
 
         assert err_sb < 2 * err_baseline
-
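The hunks above are almost entirely trailing-whitespace and end-of-file-newline fixes, which the hooks added to `.pre-commit-config.yaml` now enforce automatically. As a usage note rather than part of the patch, here is a minimal sketch of how a contributor might run those hooks locally, assuming `pre-commit` is installed from PyPI:

```bash
# One-time setup: install the hook runner and register it with git (assumption:
# pre-commit is not yet installed in the contributor's environment).
pip install pre-commit
pre-commit install

# Run every configured hook (ruff, check-yaml, end-of-file-fixer,
# trailing-whitespace, mixed-line-ending, ...) across the whole tree;
# this reproduces the kind of cleanup shown in the patch above.
pre-commit run --all-files
```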