Commit bf01538: Merge pull request #1128 from akx/type-fixes

Minor type/doc fixes

Titus-von-Koeller authored Mar 14, 2024
2 parents b03ce0e + 0c6dda0
Showing 12 changed files with 82 additions and 82 deletions.
14 changes: 7 additions & 7 deletions bitsandbytes/functional.py
@@ -1618,18 +1618,18 @@ def optimizer_update_8bit(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     max1: Tensor,
-    max2: Tensor,
+    max2: Optional[torch.Tensor],
     new_max1: Tensor,
-    new_max2: Tensor,
+    new_max2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     unorm_vec: Optional[torch.Tensor] = None,
@@ -1751,16 +1751,16 @@ def optimizer_update_8bit_blockwise(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     absmax1: Tensor,
-    absmax2: Tensor,
+    absmax2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     skip_zeros=False,
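Context for these annotations: the 8-bit update kernels are also invoked for single-state optimizers (for example Lion, or SGD with momentum), which track no second moment, so the second-state buffers arrive as `None`. Below is a minimal sketch of the calling pattern the loosened types describe; the `_apply_update` helper is hypothetical and stands in for the real kernel wrapper.

```python
from typing import Optional

import torch
from torch import Tensor


def _apply_update(
    state1: Tensor,
    qmap1: Tensor,
    state2: Optional[torch.Tensor] = None,
    qmap2: Optional[torch.Tensor] = None,
) -> None:
    # Two-state optimizers (Adam-style) carry both moment buffers;
    # one-state optimizers pass None for the second set, so any use
    # of state2/qmap2 must sit behind a None check.
    if state2 is None:
        # One-state path: only the first moment exists.
        assert qmap2 is None, "second-state buffers travel together"
    else:
        # Two-state path: both moments are updated against their maps.
        assert qmap2 is not None
```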
8 changes: 4 additions & 4 deletions bitsandbytes/nn/modules.py
@@ -658,8 +658,8 @@ class Linear8bitLt(nn.Linear):
 
     def __init__(
         self,
-        input_features,
-        output_features,
+        input_features: int,
+        output_features: int,
         bias=True,
         has_fp16_weights=True,
         memory_efficient_backward=False,
@@ -671,9 +671,9 @@ def __init__(
         Initialize Linear8bitLt class.
 
         Args:
-            input_features (`str`):
+            input_features (`int`):
                 Number of input features of the linear layer.
-            output_features (`str`):
+            output_features (`int`):
                 Number of output features of the linear layer.
             bias (`bool`, defaults to `True`):
                 Whether the linear class uses the bias term as well.
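For reference, a short usage sketch of the corrected signature; the layer sizes are illustrative and a CUDA device is assumed:

```python
import torch

import bitsandbytes as bnb

# input_features / output_features are plain ints, as in torch.nn.Linear.
layer = bnb.nn.Linear8bitLt(
    input_features=4096,
    output_features=11008,
    bias=True,
    has_fp16_weights=False,  # keep int8 weights, the usual inference setup
)
layer = layer.to("cuda")  # weights are quantized when moved to the GPU
x = torch.randn(1, 4096, dtype=torch.float16, device="cuda")
out = layer(x)  # shape (1, 11008)
```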
12 changes: 6 additions & 6 deletions bitsandbytes/optim/adagrad.py
@@ -38,8 +38,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -105,8 +105,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 8):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
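The same `dict` to `object` correction repeats through the optimizer docstrings in the files below (Adam, AdamW, LAMB, LARS, Lion): `args`, when given, is consumed by attribute access rather than key lookup, so a dictionary never matched the actual contract. A hedged sketch of what the corrected wording implies; treat the exact override mechanics as an assumption rather than documented behavior:

```python
from types import SimpleNamespace

import torch

import bitsandbytes as bnb

params = [torch.nn.Parameter(torch.randn(4096, 4096))]

# Any object exposing the expected attributes can serve as `args`;
# SimpleNamespace is the lightest way to build one. The attribute
# names mirror the optimizer's documented keyword arguments.
overrides = SimpleNamespace(
    optim_bits=8,
    min_8bit_size=4096,
    percentile_clipping=100,
    block_wise=True,
)
opt = bnb.optim.Adagrad(params, lr=1e-2, args=overrides)
```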
24 changes: 12 additions & 12 deletions bitsandbytes/optim/adam.py
@@ -47,8 +47,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -108,8 +108,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -169,8 +169,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -230,8 +230,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -291,8 +291,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -352,8 +352,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
24 changes: 12 additions & 12 deletions bitsandbytes/optim/adamw.py
@@ -39,8 +39,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -100,8 +100,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -161,8 +161,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -221,8 +221,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -281,8 +281,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -341,8 +341,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
12 changes: 6 additions & 6 deletions bitsandbytes/optim/lamb.py
@@ -45,8 +45,8 @@ def __init__(
                 Whether to use the AdamW variant.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -109,8 +109,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
12 changes: 6 additions & 6 deletions bitsandbytes/optim/lars.py
@@ -41,8 +41,8 @@ def __init__(
                 Whether to use Nesterov momentum.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -98,8 +98,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -155,8 +155,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
24 changes: 12 additions & 12 deletions bitsandbytes/optim/lion.py
@@ -33,8 +33,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -85,8 +85,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -137,8 +137,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -191,8 +191,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -242,8 +242,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -293,8 +293,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
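Lion is a one-state optimizer, which is exactly the case the `Optional[torch.Tensor]` loosening in bitsandbytes/functional.py above accommodates: its 8-bit variant reaches the update kernels with the second-state arguments set to `None`. A brief usage sketch with illustrative hyperparameters:

```python
import torch

import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096)

# Lion keeps a single momentum buffer per parameter, so its 8-bit
# variant has no second moment (state2/qmap2/absmax2) to quantize.
opt = bnb.optim.Lion8bit(model.parameters(), lr=1e-4, weight_decay=1e-2)
```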