From 8706830f56e339d2f2eb4baaf43e0b08bcd75cd5 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 17:53:28 +0200
Subject: [PATCH 1/3] Fix some bad types

---
 bitsandbytes/nn/modules.py | 8 ++++----
 bitsandbytes/utils.py      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index e1cc6600d..ec14e5940 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -658,8 +658,8 @@ class Linear8bitLt(nn.Linear):
 
     def __init__(
         self,
-        input_features,
-        output_features,
+        input_features: int,
+        output_features: int,
         bias=True,
         has_fp16_weights=True,
         memory_efficient_backward=False,
@@ -671,9 +671,9 @@ def __init__(
         Initialize Linear8bitLt class.
 
         Args:
-            input_features (`str`):
+            input_features (`int`):
                 Number of input features of the linear layer.
-            output_features (`str`):
+            output_features (`int`):
                 Number of output features of the linear layer.
             bias (`bool`, defaults to `True`):
                 Whether the linear class uses the bias term as well.
diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py
index 48c7fc82d..0229e59e2 100644
--- a/bitsandbytes/utils.py
+++ b/bitsandbytes/utils.py
@@ -140,7 +140,7 @@ def replace_linear(
             List of modules names not to convert. Defaults to `lm_head`.
         copy_weights (`bool`):
             Copy the weights from the old linear module to the new one
-        post_processing_fun_name (`str`):
+        post_processing_function (`str`):
             A function name of the replacement linear class that is called after processing.
         """
 

From 3ec3dd26655927b109b314545f3448516d69a770 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 17:56:49 +0200
Subject: [PATCH 2/3] Fix type documentation for optimizer `args`

---
 bitsandbytes/optim/adagrad.py   | 12 ++++++------
 bitsandbytes/optim/adam.py      | 24 ++++++++++++------------
 bitsandbytes/optim/adamw.py     | 24 ++++++++++++------------
 bitsandbytes/optim/lamb.py      | 12 ++++++------
 bitsandbytes/optim/lars.py      | 12 ++++++------
 bitsandbytes/optim/lion.py      | 24 ++++++++++++------------
 bitsandbytes/optim/optimizer.py |  8 ++++----
 bitsandbytes/optim/rmsprop.py   | 12 ++++++------
 bitsandbytes/optim/sgd.py       | 12 ++++++------
 9 files changed, 70 insertions(+), 70 deletions(-)

diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py
index aace548fa..7459dece1 100644
--- a/bitsandbytes/optim/adagrad.py
+++ b/bitsandbytes/optim/adagrad.py
@@ -38,8 +38,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -105,8 +105,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 8):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 The epsilon value prevents division by zero in the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py
index d8ffca63e..740db26ac 100644
--- a/bitsandbytes/optim/adam.py
+++ b/bitsandbytes/optim/adam.py
@@ -47,8 +47,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -108,8 +108,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -169,8 +169,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -230,8 +230,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -291,8 +291,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -352,8 +352,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index fa51458fd..4bf3f6436 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -39,8 +39,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -100,8 +100,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -161,8 +161,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -221,8 +221,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -281,8 +281,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -341,8 +341,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py
index ec829ee85..8d29cbbfe 100644
--- a/bitsandbytes/optim/lamb.py
+++ b/bitsandbytes/optim/lamb.py
@@ -45,8 +45,8 @@ def __init__(
                 Whether to use the AdamW variant.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -109,8 +109,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -173,8 +173,8 @@ def __init__(
                 Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead.
             adam_w_mode (`bool`, defaults to `True`):
                 Whether to use the AdamW variant.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py
index 63c062988..90c3686fe 100644
--- a/bitsandbytes/optim/lars.py
+++ b/bitsandbytes/optim/lars.py
@@ -41,8 +41,8 @@ def __init__(
                 Whether to use Nesterov momentum.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -98,8 +98,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -155,8 +155,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py
index 9f0f4a8a9..2e4163694 100644
--- a/bitsandbytes/optim/lion.py
+++ b/bitsandbytes/optim/lion.py
@@ -33,8 +33,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -85,8 +85,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -137,8 +137,8 @@ def __init__(
                 The beta values are the decay rates of the first and second-order moment of the optimizer.
             weight_decay (`float`, defaults to 0):
                 The weight decay value for the optimizer.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -191,8 +191,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -242,8 +242,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -293,8 +293,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 43ebbb24d..f1e60e5e7 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -373,8 +373,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -596,8 +596,8 @@ def __init__(
                 The weight decay value for the optimizer.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py
index 659617654..25611309b 100644
--- a/bitsandbytes/optim/rmsprop.py
+++ b/bitsandbytes/optim/rmsprop.py
@@ -41,8 +41,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -104,8 +104,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -167,8 +167,8 @@ def __init__(
                 Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py
index 0f0b12e4b..ec18f036c 100644
--- a/bitsandbytes/optim/sgd.py
+++ b/bitsandbytes/optim/sgd.py
@@ -38,8 +38,8 @@ def __init__(
                 Whether to use Nesterov momentum.
             optim_bits (`int`, defaults to 32):
                 The number of bits of the optimizer state.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -94,8 +94,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):
@@ -150,8 +150,8 @@ def __init__(
                 The weight decay value for the optimizer.
             nesterov (`bool`, defaults to `False`):
                 Whether to use Nesterov momentum.
-            args (`dict`, defaults to `None`):
-                A dictionary with additional arguments.
+            args (`object`, defaults to `None`):
+                An object with additional arguments.
             min_8bit_size (`int`, defaults to 4096):
                 The minimum number of elements of the parameter tensors for 8-bit optimization.
             percentile_clipping (`int`, defaults to 100):

From 0c6dda0842a8ee463518aa547fa0e4ab36b233db Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 13 Mar 2024 18:10:10 +0200
Subject: [PATCH 3/3] Mark some optimizer update arguments as Noneable (they
 were being called with Nones)

---
 bitsandbytes/functional.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 8fa8f2f60..bb6a04892 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1618,18 +1618,18 @@ def optimizer_update_8bit(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     max1: Tensor,
-    max2: Tensor,
+    max2: Optional[torch.Tensor],
     new_max1: Tensor,
-    new_max2: Tensor,
+    new_max2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     unorm_vec: Optional[torch.Tensor] = None,
@@ -1751,16 +1751,16 @@ def optimizer_update_8bit_blockwise(
     g: Tensor,
     p: Tensor,
     state1: Tensor,
-    state2: Tensor,
+    state2: Optional[torch.Tensor],
     beta1: float,
     beta2: float,
     eps: float,
     step: int,
     lr: float,
     qmap1: Tensor,
-    qmap2: Tensor,
+    qmap2: Optional[torch.Tensor],
     absmax1: Tensor,
-    absmax2: Tensor,
+    absmax2: Optional[torch.Tensor],
     weight_decay: float = 0.0,
     gnorm_scale: float = 1.0,
     skip_zeros=False,
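
A note on what each patch means at call sites. The `int` annotations from [PATCH 1/3] match how the layer is constructed in practice; a minimal sketch, assuming a CUDA-enabled bitsandbytes install (the shapes and flags here are illustrative, not part of the patch):

    import torch
    import bitsandbytes as bnb

    # input_features/output_features are plain ints, as the fixed annotations say.
    layer = bnb.nn.Linear8bitLt(64, 128, bias=True, has_fp16_weights=False)
    layer = layer.to("cuda")  # weights are quantized to int8 when moved to the GPU

    x = torch.randn(4, 64, dtype=torch.float16, device="cuda")
    y = layer(x)  # shape (4, 128)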
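The `dict` -> `object` fix in [PATCH 2/3] reflects that `args` is read attribute-style (like an argparse namespace), not subscripted like a dict. A sketch under that assumption; the attribute set below is inferred from the defaults the optimizer fills in and should be verified against bitsandbytes/optim/optimizer.py:

    import types

    import torch
    import bitsandbytes as bnb

    model = torch.nn.Linear(4096, 4096).cuda()

    # Any object exposing the expected attributes works; when `args` is passed,
    # it overrides the individually supplied hyperparameters.
    args = types.SimpleNamespace(
        optim_bits=8,
        min_8bit_size=4096,
        percentile_clipping=100,
        block_wise=True,
        max_unorm=0.0,
        skip_zeros=False,
    )
    optimizer = bnb.optim.Adam(model.parameters(), lr=1e-3, args=args)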
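[PATCH 3/3] legitimizes existing call sites: single-state optimizers (SGD, RMSprop, Adagrad, Lion) keep no second moment, so they pass `None` for the second state/quantile/absmax buffers, and under the old annotations every such call was a type error. A toy illustration with hypothetical stand-in signatures (not the library's actual functions):

    from typing import Optional

    import torch

    def before(state2: torch.Tensor) -> None:  # the old annotation
        ...

    def after(state2: Optional[torch.Tensor]) -> None:  # the patched annotation
        # The second-moment buffer is only touched when the optimizer
        # actually tracks a second state.
        if state2 is not None:
            state2.mul_(0.999)

    after(None)            # OK: how single-state optimizers call the update
    after(torch.zeros(8))  # OK: two-state optimizers pass a real tensor
    # before(None) is what a type checker flags:
    #   Argument 1 has incompatible type "None"; expected "Tensor"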