From 8706830f56e339d2f2eb4baaf43e0b08bcd75cd5 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 17:53:28 +0200
Subject: [PATCH 1/3] Fix some bad types

---
 bitsandbytes/nn/modules.py | 8 ++++----
 bitsandbytes/utils.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index e1cc6600d..ec14e5940 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -658,8 +658,8 @@ class Linear8bitLt(nn.Linear):
 
     def __init__(
         self,
-        input_features,
-        output_features,
+        input_features: int,
+        output_features: int,
         bias=True,
         has_fp16_weights=True,
         memory_efficient_backward=False,
@@ -671,9 +671,9 @@ def __init__(
         Initialize Linear8bitLt class.
 
         Args:
-            input_features (`str`):
+            input_features (`int`):
                 Number of input features of the linear layer.
-            output_features (`str`):
+            output_features (`int`):
                 Number of output features of the linear layer.
             bias (`bool`, defaults to `True`):
                 Whether the linear class uses the bias term as well.
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 48c7fc82d..0229e59e2 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -140,7 +140,7 @@ def replace_linear(
             List of modules names not to convert. Defaults to `lm_head`.
         copy_weights (`bool`):
             Copy the weights from the old linear module to the new one
-        post_processing_fun_name (`str`):
+        post_processing_function (`str`):
             A function name of the replacement linear class that is called after processing.
         """
 

From 3ec3dd26655927b109b314545f3448516d69a770 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 17:56:49 +0200
Subject: [PATCH 2/3] Fix type documentation for optimizer `args`

---
 bitsandbytes/optim/adagrad.py   | 12 ++++++------
 bitsandbytes/optim/adam.py      | 24 ++++++++++++------------
 bitsandbytes/optim/adamw.py     | 24 ++++++++++++------------
 bitsandbytes/optim/lamb.py      | 12 ++++++------
 bitsandbytes/optim/lars.py      | 12 ++++++------
 bitsandbytes/optim/lion.py      | 24 ++++++++++++------------
 bitsandbytes/optim/optimizer.py |  8 ++++----
 bitsandbytes/optim/rmsprop.py   | 12 ++++++------
 bitsandbytes/optim/sgd.py       | 12 ++++++------
 9 files changed, 70 insertions(+), 70 deletions(-)

diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py
index aace548fa..7459dece1 100644
--- a/bitsandbytes/optim/adagrad.py
+++ b/bitsandbytes/optim/adagrad.py
@@ -38,8 +38,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -105,8 +105,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 8):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py
index d8ffca63e..740db26ac 100644
--- a/bitsandbytes/optim/adam.py
+++ b/bitsandbytes/optim/adam.py
@@ -47,8 +47,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -108,8 +108,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -169,8 +169,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -230,8 +230,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -291,8 +291,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -352,8 +352,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index fa51458fd..4bf3f6436 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -39,8 +39,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -100,8 +100,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -161,8 +161,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -221,8 +221,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -281,8 +281,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -341,8 +341,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py
index ec829ee85..8d29cbbfe 100644
--- a/bitsandbytes/optim/lamb.py
+++ b/bitsandbytes/optim/lamb.py
@@ -45,8 +45,8 @@ def __init__(
                 Whether to use the AdamW variant.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -109,8 +109,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py
index 63c062988..90c3686fe 100644
--- a/bitsandbytes/optim/lars.py
+++ b/bitsandbytes/optim/lars.py
@@ -41,8 +41,8 @@ def __init__(
                 Whether to use Nesterov momentum.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -98,8 +98,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -155,8 +155,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py
index 9f0f4a8a9..2e4163694 100644
--- a/bitsandbytes/optim/lion.py
+++ b/bitsandbytes/optim/lion.py
@@ -33,8 +33,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -85,8 +85,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -137,8 +137,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -191,8 +191,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -242,8 +242,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -293,8 +293,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 43ebbb24d..f1e60e5e7 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -373,8 +373,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -596,8 +596,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py
index 659617654..25611309b 100644
--- a/bitsandbytes/optim/rmsprop.py
+++ b/bitsandbytes/optim/rmsprop.py
@@ -41,8 +41,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -104,8 +104,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -167,8 +167,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py
index 0f0b12e4b..ec18f036c 100644
--- a/bitsandbytes/optim/sgd.py
+++ b/bitsandbytes/optim/sgd.py
@@ -38,8 +38,8 @@ def __init__(
                 Whether to use Nesterov momentum.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -94,8 +94,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -150,8 +150,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):

From 0c6dda0842a8ee463518aa547fa0e4ab36b233db Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 18:10:10 +0200
Subject: [PATCH 3/3] Mark some optimizer update arguments as Noneable (they
 were being called with Nones)

---
 bitsandbytes/functional.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 8fa8f2f60..bb6a04892 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1618,18 +1618,18 @@ def optimizer_update_8bit(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     max1: Tensor,
-    max2: Tensor,
+    max2: Optional[torch.Tensor],
     new_max1: Tensor,
-    new_max2: Tensor,
+    new_max2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     unorm_vec: Optional[torch.Tensor] = None,
@@ -1751,16 +1751,16 @@ def optimizer_update_8bit_blockwise(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     absmax1: Tensor,
-    absmax2: Tensor,
+    absmax2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     skip_zeros=False,
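
A note on what each patch means at call sites. The `int` annotations from [PATCH 1/3] match how the layer is constructed in practice; a minimal sketch, assuming a CUDA-enabled bitsandbytes install (the shapes and flags here are illustrative, not part of the patch):

    import torch
    import bitsandbytes as bnb

    # input_features/output_features are plain ints, as the fixed annotations say.
    layer = bnb.nn.Linear8bitLt(64, 128, bias=True, has_fp16_weights=False)
    layer = layer.to("cuda")  # weights are quantized to int8 when moved to the GPU

    x = torch.randn(4, 64, dtype=torch.float16, device="cuda")
    y = layer(x)  # shape (4, 128)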
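The `dict` -> `object` fix in [PATCH 2/3] reflects that `args` is read attribute-style (like an argparse namespace), not subscripted like a dict. A sketch under that assumption; the attribute set below is inferred from the defaults the optimizer fills in and should be verified against bitsandbytes/optim/optimizer.py:

    import types

    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(4096, 4096).cuda()

    # Any object exposing the expected attributes works; when `args` is passed,
    # it overrides the individually supplied hyperparameters.
    args = types.SimpleNamespace(
        optim_bits=8,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
        max_unorm=0.0,
        skip_zeros=False,
    )
    optimizer = bnb.optim.Adam(model.parameters(), lr=1e-3, args=args)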
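[PATCH 3/3] legitimizes existing call sites: single-state optimizers (SGD, RMSprop, Adagrad, Lion) keep no second moment, so they pass `None` for the second state/quantile/absmax buffers, and under the old annotations every such call was a type error. A toy illustration with hypothetical stand-in signatures (not the library's actual functions):

    from typing import Optional

    import torch

    def before(state2: torch.Tensor) -> None:  # the old annotation
        ...

    def after(state2: Optional[torch.Tensor]) -> None:  # the patched annotation
        # The second-moment buffer is only touched when the optimizer
        # actually tracks a second state.
        if state2 is not None:
            state2.mul_(0.999)

    after(None)            # OK: how single-state optimizers call the update
    after(torch.zeros(8))  # OK: two-state optimizers pass a real tensor
    # before(None) is what a type checker flags:
    #   Argument 1 has incompatible type "None"; expected "Tensor"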