diff --git a/keras/initializers/__init__.py b/keras/initializers/__init__.py
index 8ddb3ad78d9..8968dbf1899 100644
--- a/keras/initializers/__init__.py
+++ b/keras/initializers/__init__.py
@@ -67,7 +67,8 @@ def populate_deserializable_objects():
     LOCAL.ALL_OBJECTS["ZerosV2"] = initializers_v2.Zeros
 
     # Out of an abundance of caution we also include these aliases that have
-    # a non-zero probability of having been included in saved configs in the past.
+    # a non-zero probability of having been included in saved configs in the
+    # past.
     LOCAL.ALL_OBJECTS["glorot_normalV2"] = initializers_v2.GlorotNormal
     LOCAL.ALL_OBJECTS["glorot_uniformV2"] = initializers_v2.GlorotUniform
     LOCAL.ALL_OBJECTS["he_normalV2"] = initializers_v2.HeNormal
@@ -150,16 +151,16 @@ def deserialize(config, custom_objects=None):
 def get(identifier):
     """Retrieve a Keras initializer by the identifier.
 
-    The `identifier` may be the string name of a initializers function or class (
-    case-sensitively).
+    The `identifier` may be the string name of an initializer function or
+    class (case-sensitively).
 
     >>> identifier = 'Ones'
     >>> tf.keras.initializers.deserialize(identifier)
     <...keras.initializers.initializers_v2.Ones...>
 
     You can also specify `config` of the initializer to this function by passing
-    dict containing `class_name` and `config` as an identifier. Also note that the
-    `class_name` must map to a `Initializer` class.
+    a dict containing `class_name` and `config` as an identifier. Also note
+    that the `class_name` must map to an `Initializer` class.
 
     >>> cfg = {'class_name': 'Ones', 'config': {}}
     >>> tf.keras.initializers.deserialize(cfg)
diff --git a/keras/initializers/initializers_test.py b/keras/initializers/initializers_test.py
index 14baef19f6b..c203fded395 100644
--- a/keras/initializers/initializers_test.py
+++ b/keras/initializers/initializers_test.py
@@ -65,8 +65,8 @@ def _runner(
     target_max=None,
     target_min=None,
 ):
-    # The global seed is set so that we can get the same random streams between
-    # eager and graph mode when stateful op is used.
+    # The global seed is set so that we can get the same random streams
+    # between eager and graph mode when a stateful op is used.
     tf.random.set_seed(1337)
     variable = backend.variable(init(shape))
     output = backend.get_value(variable)
@@ -314,8 +314,9 @@ def test_partition(self, initializer_cls, kwargs):
         self.assertEqual(result.shape, (2, 2))
 
         if hasattr(initializer, "seed"):
-            # Make sure the result are different when the partition_shape is same,
-            # but partition_offset is different, for random related initializers.
+            # Make sure the results are different when the partition_shape is
+            # the same but partition_offset is different, for random-related
+            # initializers.
             result_2 = initializer(
                 shape=(4, 2),
                 partition_shape=(2, 2),
@@ -325,9 +326,11 @@ def test_partition(self, initializer_cls, kwargs):
 
             # Make sure initializer produce same result when provide same
             # partition offset.
-            # TODO(scottzhu): Enable this assert when initializer is fully stateless
+            # TODO(scottzhu): Enable this assert when initializer is fully
+            # stateless
             # result_3 = initializer(
-            #     shape=(4, 2), partition_shape=(2, 2), partition_offset=(1, 0))
+            #     shape=(4, 2), partition_shape=(2, 2), partition_offset=(1,
+            #     0))
             # self.assertAllClose(result_2, result_3)
 
     @parameterized.named_parameters(
diff --git a/keras/initializers/initializers_v1.py b/keras/initializers/initializers_v1.py
index 2a2d271812d..068e2e31fa3 100644
--- a/keras/initializers/initializers_v1.py
+++ b/keras/initializers/initializers_v1.py
@@ -71,8 +71,8 @@ class RandomNormal(tf.compat.v1.random_normal_initializer):
     Args:
       mean: a python scalar or a scalar tensor. Mean of the random values to
        generate.
-      stddev: a python scalar or a scalar tensor. Standard deviation of the random
-        values to generate.
+      stddev: a python scalar or a scalar tensor. Standard deviation of the
+        random values to generate.
       seed: A Python integer. Used to create random seeds. See
         `tf.compat.v1.set_random_seed` for behavior.
       dtype: Default data type, used if no `dtype` argument is provided when
diff --git a/keras/initializers/initializers_v2.py b/keras/initializers/initializers_v2.py
index 7af7afb2be4..368b5987d3f 100644
--- a/keras/initializers/initializers_v2.py
+++ b/keras/initializers/initializers_v2.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Keras initializers for TF 2."""
-# pylint: disable=g-classes-have-attributes, missing-docstring, g-direct-tensorflow-import
 
 import math
 
@@ -65,10 +64,10 @@ def get_config(self):  # To support serialization
       return {"mean": self.mean, "stddev": self.stddev}
     ```
 
-    Note that we don't have to implement `from_config` in the example above since
-    the constructor arguments of the class the keys in the config returned by
-    `get_config` are the same. In this case, the default `from_config`
-    works fine.
+    Note that we don't have to implement `from_config` in the example above
+    since the constructor arguments of the class and the keys in the config
+    returned by `get_config` are the same. In this case, the default
+    `from_config` works fine.
     """
 
     def __call__(self, shape, dtype=None, **kwargs):
@@ -135,10 +134,10 @@ def __call__(self, shape, dtype=None, **kwargs):
 
         Args:
           shape: Shape of the tensor.
-          dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are
-            supported. If not specified, `tf.keras.backend.floatx()` is used,
-            which default to `float32` unless you configured it otherwise
-            (via `tf.keras.backend.set_floatx(float_dtype)`).
+          dtype: Optional dtype of the tensor. Only numeric or boolean dtypes
+            are supported. If not specified, `tf.keras.backend.floatx()` is
+            used, which defaults to `float32` unless you configured it
+            otherwise (via `tf.keras.backend.set_floatx(float_dtype)`).
           **kwargs: Additional keyword arguments.
         """
         _validate_kwargs(self.__class__.__name__, kwargs)
@@ -177,10 +176,10 @@ def __call__(self, shape, dtype=None, **kwargs):
 
         Args:
          shape: Shape of the tensor.
-          dtype: Optional dtype of the tensor. Only numeric or boolean dtypes are
-            supported. If not specified, `tf.keras.backend.floatx()` is used,
-            which default to `float32` unless you configured it otherwise
-            (via `tf.keras.backend.set_floatx(float_dtype)`).
+          dtype: Optional dtype of the tensor. Only numeric or boolean dtypes
+            are supported. If not specified, `tf.keras.backend.floatx()` is
+            used, which defaults to `float32` unless you configured it
+            otherwise (via `tf.keras.backend.set_floatx(float_dtype)`).
           **kwargs: Additional keyword arguments.
         """
         _validate_kwargs(self.__class__.__name__, kwargs)
@@ -279,10 +278,9 @@ class RandomUniform(Initializer):
       maxval: A python scalar or a scalar tensor. Upper bound of the range of
         random values to generate (exclusive).
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
     """
 
     def __init__(self, minval=-0.05, maxval=0.05, seed=None):
@@ -356,13 +354,12 @@ class RandomNormal(Initializer):
     Args:
       mean: a python scalar or a scalar tensor. Mean of the random values to
        generate.
-      stddev: a python scalar or a scalar tensor. Standard deviation of the random
-        values to generate.
+      stddev: a python scalar or a scalar tensor. Standard deviation of the
+        random values to generate.
       seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
     """
 
     def __init__(self, mean=0.0, stddev=0.05, seed=None):
@@ -377,8 +374,8 @@ def __call__(self, shape, dtype=None, **kwargs):
         Args:
           shape: Shape of the tensor.
           dtype: Optional dtype of the tensor. Only floating point types are
-            supported. If not specified, `tf.keras.backend.floatx()` is used, which
-            default to `float32` unless you configured it otherwise (via
+            supported. If not specified, `tf.keras.backend.floatx()` is used,
+            which defaults to `float32` unless you configured it otherwise (via
             `tf.keras.backend.set_floatx(float_dtype)`)
           **kwargs: Additional keyword arguments.
         """
@@ -443,10 +440,9 @@ class TruncatedNormal(Initializer):
       stddev: a python scalar or a scalar tensor. Standard deviation of the
         random values to generate before truncation.
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
     """
 
     def __init__(self, mean=0.0, stddev=0.05, seed=None):
@@ -461,8 +457,8 @@ def __call__(self, shape, dtype=None, **kwargs):
         Args:
           shape: Shape of the tensor.
           dtype: Optional dtype of the tensor. Only floating point types are
-            supported. If not specified, `tf.keras.backend.floatx()` is used, which
-            default to `float32` unless you configured it otherwise (via
+            supported. If not specified, `tf.keras.backend.floatx()` is used,
+            which defaults to `float32` unless you configured it otherwise (via
             `tf.keras.backend.set_floatx(float_dtype)`)
           **kwargs: Additional keyword arguments.
         """
@@ -507,9 +503,9 @@ class VarianceScaling(Initializer):
     `tf.keras.initializers.variance_scaling`.
 
     With `distribution="truncated_normal" or "untruncated_normal"`, samples are
-    drawn from a truncated/untruncated normal distribution with a mean of zero and
-    a standard deviation (after truncation, if used) `stddev = sqrt(scale / n)`,
-    where `n` is:
+    drawn from a truncated/untruncated normal distribution with a mean of zero
+    and a standard deviation (after truncation, if used) `stddev = sqrt(scale /
+    n)`, where `n` is:
 
     - number of input units in the weight tensor, if `mode="fan_in"`
    - number of output units, if `mode="fan_out"`
@@ -536,10 +532,9 @@ class VarianceScaling(Initializer):
       distribution: Random distribution to use. One of "truncated_normal",
         "untruncated_normal" and "uniform".
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
     """
 
     def __init__(
@@ -585,8 +580,8 @@ def __call__(self, shape, dtype=None, **kwargs):
         Args:
           shape: Shape of the tensor.
           dtype: Optional dtype of the tensor. Only floating point types are
-            supported. If not specified, `tf.keras.backend.floatx()` is used, which
-            default to `float32` unless you configured it otherwise (via
+            supported. If not specified, `tf.keras.backend.floatx()` is used,
+            which defaults to `float32` unless you configured it otherwise (via
             `tf.keras.backend.set_floatx(float_dtype)`)
           **kwargs: Additional keyword arguments.
         """
@@ -621,7 +616,8 @@ def _generate_init_val(self, shape, dtype, nonce):
         else:
             scale /= max(1.0, (fan_in + fan_out) / 2.0)
         if self.distribution == "truncated_normal":
-            # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+            # constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0.,
+            # scale=1.)
             stddev = math.sqrt(scale) / 0.87962566103423978
             return self._random_generator.truncated_normal(
                 shape, 0.0, stddev, dtype, nonce
@@ -654,11 +650,11 @@ class Orthogonal(Initializer):
     Also available via the shortcut function
     `tf.keras.initializers.orthogonal`.
 
-    If the shape of the tensor to initialize is two-dimensional, it is initialized
-    with an orthogonal matrix obtained from the QR decomposition of a matrix of
-    random numbers drawn from a normal distribution.
-    If the matrix has fewer rows than columns then the output will have orthogonal
-    rows. Otherwise, the output will have orthogonal columns.
+    If the shape of the tensor to initialize is two-dimensional, it is
+    initialized with an orthogonal matrix obtained from the QR decomposition of
+    a matrix of random numbers drawn from a normal distribution. If the matrix
+    has fewer rows than columns then the output will have orthogonal rows.
+    Otherwise, the output will have orthogonal columns.
 
     If the shape of the tensor to initialize is more than two-dimensional,
     a matrix of shape `(shape[0] * ... * shape[n - 2], shape[n - 1])`
@@ -678,10 +674,9 @@ class Orthogonal(Initializer):
     Args:
       gain: multiplicative factor to apply to the orthogonal matrix
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
@@ -823,8 +818,8 @@ class GlorotUniform(VarianceScaling):
     `tf.keras.initializers.glorot_uniform`.
 
     Draws samples from a uniform distribution within `[-limit, limit]`, where
-    `limit = sqrt(6 / (fan_in + fan_out))` (`fan_in` is the number of input units
-    in the weight tensor and `fan_out` is the number of output units).
+    `limit = sqrt(6 / (fan_in + fan_out))` (`fan_in` is the number of input
+    units in the weight tensor and `fan_out` is the number of output units).
 
     Examples:
 
@@ -838,10 +833,9 @@ class GlorotUniform(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
@@ -865,10 +859,10 @@ class GlorotNormal(VarianceScaling):
     Also available via the shortcut function
     `tf.keras.initializers.glorot_normal`.
 
-    Draws samples from a truncated normal distribution centered on 0 with `stddev
-    = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number of input units in
-    the weight tensor and `fan_out` is the number of output units in the weight
-    tensor.
+    Draws samples from a truncated normal distribution centered on 0 with
+    `stddev = sqrt(2 / (fan_in + fan_out))` where `fan_in` is the number of
+    input units in the weight tensor and `fan_out` is the number of output
+    units in the weight tensor.
 
     Examples:
 
@@ -882,10 +876,9 @@ class GlorotNormal(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
@@ -916,9 +909,9 @@ class LecunNormal(VarianceScaling):
     the Initializer object, without knowing the shape and dtype of the variable
     being initialized.
 
-    Draws samples from a truncated normal distribution centered on 0 with `stddev
-    = sqrt(1 / fan_in)` where `fan_in` is the number of input units in the weight
-    tensor.
+    Draws samples from a truncated normal distribution centered on 0 with
+    `stddev = sqrt(1 / fan_in)` where `fan_in` is the number of input units in
+    the weight tensor.
 
     Examples:
 
@@ -932,10 +925,9 @@ class LecunNormal(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
@@ -959,8 +951,8 @@ class LecunUniform(VarianceScaling):
     Also available via the shortcut function
     `tf.keras.initializers.lecun_uniform`.
 
-    Draws samples from a uniform distribution within `[-limit, limit]`,
-    where `limit = sqrt(3 / fan_in)` (`fan_in` is the number of input units in the
+    Draws samples from a uniform distribution within `[-limit, limit]`, where
+    `limit = sqrt(3 / fan_in)` (`fan_in` is the number of input units in the
     weight tensor).
 
     Examples:
@@ -975,10 +967,9 @@ class LecunUniform(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [Klambauer et al., 2017](https://arxiv.org/abs/1706.02515)
@@ -1003,8 +994,8 @@ class HeNormal(VarianceScaling):
     `tf.keras.initializers.he_normal`.
 
     It draws samples from a truncated normal distribution centered on 0 with
-    `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of input units in the
-    weight tensor.
+    `stddev = sqrt(2 / fan_in)` where `fan_in` is the number of input units in
+    the weight tensor.
 
     Examples:
 
@@ -1018,10 +1009,9 @@ class HeNormal(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [He et al., 2015](https://arxiv.org/abs/1502.01852)
@@ -1061,10 +1051,9 @@ class HeUniform(VarianceScaling):
 
     Args:
      seed: A Python integer. Used to make the behavior of the initializer
-        deterministic. Note that a seeded
-        initializer will not produce the same random values across multiple calls,
-        but multiple initializers will produce the same sequence when constructed
-        with the same seed value.
+        deterministic. Note that a seeded initializer will not produce the same
+        random values across multiple calls, but multiple initializers will
+        produce the same sequence when constructed with the same seed value.
 
     References:
       - [He et al., 2015](https://arxiv.org/abs/1502.01852)
@@ -1152,8 +1141,8 @@ def _ensure_keras_seeded():
     """Make sure the keras.backend global seed generator is set.
 
     This is important for DTensor use case to ensure that each client are
-    initialized with same seed for tf.random.Generator, so that the value created
-    are in sync among all the clients.
+    initialized with the same seed for tf.random.Generator, so that the
+    values created are in sync among all the clients.
     """
     if not getattr(
        backend._SEED_GENERATOR, "generator", None
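The docstrings and tests touched by this diff describe behavior that is easy to misread from the rewrapped text alone; the short sketches below illustrate them against the public `tf.keras` API. First, the `partition_shape`/`partition_offset` keywords exercised in `test_partition`: per the test's assertions, asking for a partition returns a tensor of the partition's shape, and for seeded random initializers a different offset yields different values. This is a minimal sketch using `GlorotUniform`, assuming it accepts these keywords the same way the parameterized test does:

```python
import tensorflow as tf

init = tf.keras.initializers.GlorotUniform(seed=1337)

# Request the (2, 2) block at offset (0, 0) of a logical (4, 2) variable.
block = init(shape=(4, 2), partition_shape=(2, 2), partition_offset=(0, 0))
print(block.shape)  # (2, 2), matching the assertion in test_partition

# Same partition_shape, different partition_offset: the test expects the
# values to differ for random-related initializers.
other = init(shape=(4, 2), partition_shape=(2, 2), partition_offset=(1, 0))
print(bool(tf.reduce_all(block == other).numpy()))  # expected: False
```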
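The `get` docstring edited above accepts either a case-sensitive string name or a `{'class_name': ..., 'config': ...}` dict whose `class_name` maps to an `Initializer` class. A sketch of both forms:

```python
import tensorflow as tf

# String identifier: the case-sensitive class name.
ones = tf.keras.initializers.get("Ones")

# Dict identifier: `class_name` must map to an `Initializer` class.
cfg = {"class_name": "RandomNormal", "config": {"mean": 0.0, "stddev": 0.05}}
normal = tf.keras.initializers.get(cfg)
print(type(ones).__name__, normal.get_config())
```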
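The `Initializer` base-class docstring rewrapped above makes a point worth spelling out: no `from_config` override is needed when the constructor arguments and the keys returned by `get_config` match. A sketch completing that docstring example (the class name `ExampleRandomNormal` is the docstring's own; the `floatx()` fallback mirrors the `dtype` documentation in the same file):

```python
import tensorflow as tf


class ExampleRandomNormal(tf.keras.initializers.Initializer):
    """Custom initializer from the base-class docstring example."""

    def __init__(self, mean, stddev):
        self.mean = mean
        self.stddev = stddev

    def __call__(self, shape, dtype=None, **kwargs):
        # If no dtype is given, fall back to the configured floatx
        # (float32 unless changed via tf.keras.backend.set_floatx).
        dtype = dtype or tf.keras.backend.floatx()
        return tf.random.normal(
            shape, mean=self.mean, stddev=self.stddev, dtype=dtype
        )

    def get_config(self):  # To support serialization
        return {"mean": self.mean, "stddev": self.stddev}


# Constructor arguments and config keys match, so the default `from_config`
# round-trips the initializer through serialize/deserialize.
cfg = tf.keras.initializers.serialize(ExampleRandomNormal(0.0, 0.05))
restored = tf.keras.initializers.deserialize(
    cfg, custom_objects={"ExampleRandomNormal": ExampleRandomNormal}
)
print(restored.get_config())  # {'mean': 0.0, 'stddev': 0.05}
```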
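The seed note repeated throughout these docstrings has two halves that are easy to conflate: a seeded initializer does not repeat values across its own calls, but two initializers built with the same seed produce the same sequence. A sketch of both claims as the docstrings state them:

```python
import tensorflow as tf

a = tf.keras.initializers.RandomNormal(seed=1337)
b = tf.keras.initializers.RandomNormal(seed=1337)

# Same seed, same call index: the two initializers agree.
a_first, a_second = a((2, 2)), a((2, 2))
b_first = b((2, 2))
tf.debugging.assert_near(a_first, b_first)

# One seeded initializer called twice does not repeat itself.
print(bool(tf.reduce_all(a_first == a_second).numpy()))  # expected: False
```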
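The `VarianceScaling` hunks show both the `stddev = sqrt(scale / n)` formula and the truncation correction constant used in `_generate_init_val`. A standalone sketch of that arithmetic (the helper name is mine, not the library's):

```python
import math


def variance_scaling_stddev(scale, mode, fan_in, fan_out):
    # Mirrors _generate_init_val: divide `scale` by the fan value...
    if mode == "fan_in":
        scale /= max(1.0, fan_in)
    elif mode == "fan_out":
        scale /= max(1.0, fan_out)
    else:  # "fan_avg"
        scale /= max(1.0, (fan_in + fan_out) / 2.0)
    # ...then correct by the std of a unit normal truncated to [-2, 2],
    # i.e. scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.).
    return math.sqrt(scale) / 0.87962566103423978


# With scale=1 and mode="fan_avg" (the GlorotNormal configuration), the
# pre-correction stddev is sqrt(2 / (fan_in + fan_out)), exactly as the
# GlorotNormal docstring above states.
print(variance_scaling_stddev(1.0, "fan_avg", fan_in=64, fan_out=32))
```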
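The reflowed `Orthogonal` docstring compresses the algorithm into prose: QR-decompose a random normal matrix, flattening all but the last dimension for higher-rank shapes, with orthogonal rows when there are fewer rows than columns. A rough NumPy sketch of that description; the sign correction on `diag(r)` is an assumption drawn from common orthogonal-initializer implementations, not quoted from this diff:

```python
import numpy as np


def orthogonal_sketch(shape, gain=1.0, rng=None):
    """Sketch of the Orthogonal docstring: QR of a random normal matrix."""
    rng = rng or np.random.default_rng(0)
    # For >2-D shapes, a matrix of shape
    # (shape[0] * ... * shape[n - 2], shape[n - 1]) is initialized.
    num_rows = int(np.prod(shape[:-1]))
    num_cols = int(shape[-1])
    a = rng.normal(size=(max(num_rows, num_cols), min(num_rows, num_cols)))
    q, r = np.linalg.qr(a)
    # Fix signs so the factorization is unique (assumed detail).
    q *= np.sign(np.diag(r))
    if num_rows < num_cols:
        # Fewer rows than columns: the output has orthogonal rows.
        q = q.T
    return gain * q.reshape(shape)


w = orthogonal_sketch((8, 4))
print(np.allclose(w.T @ w, np.eye(4)))  # orthonormal columns -> True
```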
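The Glorot/He/Lecun docstrings rewrapped above each quote a closed-form bound or stddev. Collected in one place for comparison (the `HeUniform` limit is not quoted in this diff and is included from the usual definition, `sqrt(6 / fan_in)`):

```python
import math

fan_in, fan_out = 64, 32

# Uniform limits quoted in the docstrings:
glorot_limit = math.sqrt(6.0 / (fan_in + fan_out))  # GlorotUniform
lecun_limit = math.sqrt(3.0 / fan_in)               # LecunUniform
he_limit = math.sqrt(6.0 / fan_in)                  # HeUniform (assumed)

# Truncated-normal stddevs (before the truncation correction):
glorot_stddev = math.sqrt(2.0 / (fan_in + fan_out))  # GlorotNormal
lecun_stddev = math.sqrt(1.0 / fan_in)               # LecunNormal
he_stddev = math.sqrt(2.0 / fan_in)                  # HeNormal
```

All of these are thin `VarianceScaling` configurations, which is why the classes above subclass it.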
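Finally, the `_ensure_keras_seeded` docstring fixed at the end concerns keeping DTensor clients in sync: the check is on `backend._SEED_GENERATOR.generator`. In user code that generator is populated by seeding Keras globally; if my reading of the utility is right, `tf.keras.utils.set_random_seed` does this while also seeding Python, NumPy, and TF:

```python
import tensorflow as tf

# Seed everything once per client so generated values stay in sync.
tf.keras.utils.set_random_seed(1337)
```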