From 258e8667b9f0bb372ede85463f17b8e55f2ed9ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kaan=20B=C4=B1=C3=A7akc=C4=B1?= Date: Sat, 22 Apr 2023 10:39:14 +0100 Subject: [PATCH 1/4] Update docstring indentation to 4 spaces. --- keras/losses.py | 884 +++++++++++++++++++++++++----------------------- 1 file changed, 455 insertions(+), 429 deletions(-) diff --git a/keras/losses.py b/keras/losses.py index 178cfb863bc..609e30c5c6a 100644 --- a/keras/losses.py +++ b/keras/losses.py
@@ -44,7 +44,7 @@ class Loss: To be implemented by subclasses: * `call()`: Contains the logic for loss calculation using `y_true`, - `y_pred`. + `y_pred`. Example subclass implementation:
@@ -52,7 +52,7 @@ class Loss: class MeanSquaredError(Loss): def call(self, y_true, y_pred): - return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1) + return tf.reduce_mean(tf.math.square(y_pred - y_true), axis=-1) ``` When using a Loss under a `tf.distribute.Strategy`, except passing it
@@ -69,16 +69,17 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name=None): """Initializes `Loss` class. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. """ losses_utils.ReductionV2.validate(reduction) self.reduction = reduction
@@ -102,26 +103,26 @@ def __call__(self, y_true, y_pred, sample_weight=None): """Invokes the `Loss` instance. Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, except - sparse loss functions such as sparse categorical crossentropy where - shape = `[batch_size, d0, .. dN-1]` - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` - sample_weight: Optional `sample_weight` acts as a coefficient for the - loss. If a scalar is provided, then the loss is simply scaled by the - given value. If `sample_weight` is a tensor of size `[batch_size]`, - then the total loss for each sample of the batch is rescaled by the - corresponding element in the `sample_weight` vector. If the shape of - `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be - broadcasted to this shape), then each loss element of `y_pred` is - scaled by the corresponding value of `sample_weight`. (Note - on`dN-1`: all loss functions reduce by 1 dimension, usually - axis=-1.) + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, + except sparse loss functions such as sparse categorical + crossentropy where shape = `[batch_size, d0, .. dN-1]` + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + sample_weight: Optional `sample_weight` acts as a coefficient for + the loss. If a scalar is provided, then the loss is simply + scaled by the given value. If `sample_weight` is a tensor of + size `[batch_size]`, then the total loss for each sample of the + batch is rescaled by the corresponding element in the + `sample_weight` vector. If the shape of `sample_weight` is + `[batch_size, d0, .. dN-1]` (or can be broadcasted to this + shape), then each loss element of `y_pred` is scaled by the + corresponding value of `sample_weight`. (Note on `dN-1`: all loss + functions reduce by 1 dimension, usually axis=-1.) Returns: - Weighted loss float `Tensor`. If `reduction` is `NONE`, this has - shape `[batch_size, d0, .. dN-1]`; otherwise, it is scalar. (Note - `dN-1` because all loss functions reduce by 1 dimension, usually - axis=-1.) + Weighted loss float `Tensor`. If `reduction` is `NONE`, this has + shape `[batch_size, d0, .. dN-1]`; otherwise, it is scalar. + (Note `dN-1` because all loss functions reduce by 1 dimension, + usually axis=-1.) Raises: ValueError: If the shape of `sample_weight` is invalid.
@@ -183,13 +184,13 @@ def call(self, y_true, y_pred): """Invokes the `Loss` instance. Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, except - sparse loss functions such as sparse categorical crossentropy where - shape = `[batch_size, d0, .. dN-1]` - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`, + except sparse loss functions such as sparse categorical + crossentropy where shape = `[batch_size, d0, .. dN-1]` + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` Returns: - Loss values with the shape `[batch_size, d0, .. dN-1]`. + Loss values with the shape `[batch_size, d0, .. dN-1]`. """ raise NotImplementedError("Must be implemented in subclasses.")
@@ -229,19 +230,20 @@ def __init__( """Initializes `LossFunctionWrapper` class. Args: - fn: The loss function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. - **kwargs: The keyword arguments that are passed on to `fn`. + fn: The loss function to wrap, with signature `fn(y_true, y_pred, + **kwargs)`. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + **kwargs: The keyword arguments that are passed on to `fn`.
""" super().__init__(reduction=reduction, name=name) self.fn = fn @@ -251,11 +253,11 @@ def call(self, y_true, y_pred): """Invokes the `LossFunctionWrapper` instance. Args: - y_true: Ground truth values. - y_pred: The predicted values. + y_true: Ground truth values. + y_pred: The predicted values. Returns: - Loss values per sample. + Loss values per sample. """ if tf.is_tensor(y_pred) and tf.is_tensor(y_true): y_pred, y_true = losses_utils.squeeze_or_expand_dimensions( @@ -343,17 +345,18 @@ def __init__( """Initializes `MeanSquaredError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to - 'mean_squared_error'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_squared_error'. """ super().__init__(mean_squared_error, name=name, reduction=reduction) @@ -404,17 +407,18 @@ def __init__( """Initializes `MeanAbsoluteError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to - 'mean_absolute_error'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_absolute_error'. """ super().__init__(mean_absolute_error, name=name, reduction=reduction) @@ -471,17 +475,18 @@ def __init__( """Initializes `MeanAbsolutePercentageError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. 
For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to - 'mean_absolute_percentage_error'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_absolute_percentage_error'. """ super().__init__( mean_absolute_percentage_error, name=name, reduction=reduction @@ -535,17 +540,18 @@ def __init__( """Initializes `MeanSquaredLogarithmicError` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to - 'mean_squared_logarithmic_error'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to + 'mean_squared_logarithmic_error'. """ super().__init__( mean_squared_logarithmic_error, name=name, reduction=reduction @@ -561,10 +567,10 @@ class BinaryCrossentropy(LossFunctionWrapper): - `y_true` (true label): This is either 0 or 1. - `y_pred` (predicted value): This is the model's prediction, i.e, a single - floating-point value which either represents a - [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] - when `from_logits=True`) or a probability (i.e, value in [0., 1.] when - `from_logits=False`). + floating-point value which either represents a + [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] + when `from_logits=True`) or a probability (i.e, value in [0., 1.] when + `from_logits=False`). **Recommended Usage:** (set `from_logits=True`) @@ -572,8 +578,8 @@ class BinaryCrossentropy(LossFunctionWrapper): ```python model.compile( - loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), - .... + loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), + .... 
) ``` @@ -627,27 +633,28 @@ def __init__( """Initializes `BinaryCrossentropy` instance. Args: - from_logits: Whether to interpret `y_pred` as a tensor of - [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we - assume that `y_pred` contains probabilities (i.e., values in [0, - 1]). - label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. When > - 0, we compute the loss between the predicted labels and a smoothed - version of the true labels, where the smoothing squeezes the labels - towards 0.5. Larger values of `label_smoothing` correspond to - heavier smoothing. - axis: The axis along which to compute crossentropy (the features - axis). Defaults to -1. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Name for the op. Defaults to 'binary_crossentropy'. + from_logits: Whether to interpret `y_pred` as a tensor of + [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we + assume that `y_pred` contains probabilities (i.e., values in [0, + 1]). + label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. + When > 0, we compute the loss between the predicted labels and a + smoothed version of the true labels, where the smoothing + squeezes the labels towards 0.5. Larger values of + `label_smoothing` correspond to heavier smoothing. + axis: The axis along which to compute crossentropy (the features + axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Name for the op. Defaults to 'binary_crossentropy'. """ super().__init__( binary_crossentropy, @@ -669,10 +676,10 @@ class BinaryFocalCrossentropy(LossFunctionWrapper): - `y_true` (true label): This is either 0 or 1. - `y_pred` (predicted value): This is the model's prediction, i.e, a single - floating-point value which either represents a - [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] - when `from_logits=True`) or a probability (i.e, value in `[0., 1.]` when - `from_logits=False`). + floating-point value which either represents a + [logit](https://en.wikipedia.org/wiki/Logit), (i.e, value in [-inf, inf] + when `from_logits=True`) or a probability (i.e, value in [0., 1.] when + `from_logits=False`). According to [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf), it helps to apply a "focal factor" to down-weight easy examples and focus more @@ -765,35 +772,35 @@ class BinaryFocalCrossentropy(LossFunctionWrapper): Args: - apply_class_balancing: A bool, whether to apply weight balancing on the - binary classes 0 and 1. 
- alpha: A weight balancing factor for class 1, default is `0.25` as - mentioned in reference [Lin et al., 2018]( - https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is - `1.0 - alpha`. - gamma: A focusing parameter used to compute the focal factor, default is - `2.0` as mentioned in the reference - [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf). - from_logits: Whether to interpret `y_pred` as a tensor of - [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we - assume that `y_pred` are probabilities (i.e., values in `[0, 1]`). - label_smoothing: Float in `[0, 1]`. When `0`, no smoothing occurs. When > - `0`, we compute the loss between the predicted labels and a smoothed - version of the true labels, where the smoothing squeezes the labels - towards `0.5`. Larger values of `label_smoothing` correspond to heavier - smoothing. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to `-1`. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Name for the op. Defaults to 'binary_focal_crossentropy'. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in reference [Lin et al., 2018]( + https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is + `1.0 - alpha`. + gamma: A focusing parameter used to compute the focal factor, default is + `2.0` as mentioned in the reference + [Lin et al., 2018](https://arxiv.org/pdf/1708.02002.pdf). + from_logits: Whether to interpret `y_pred` as a tensor of + [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we + assume that `y_pred` are probabilities (i.e., values in `[0, 1]`). + label_smoothing: Float in `[0, 1]`. When `0`, no smoothing occurs. + When > `0`, we compute the loss between the predicted labels and a + smoothed version of the true labels, where the smoothing squeezes + the labels towards `0.5`. Larger values of `label_smoothing` + correspond to heavier smoothing. + axis: The axis along which to compute crossentropy (the features axis). + Defaults to `-1`. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the reduction + option will be determined by the usage context. For almost all cases + this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + `tf.distribute.Strategy`, except via `Model.compile()` and + `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` + will raise an error. Please see this custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Name for the op. Defaults to 'binary_focal_crossentropy'. """ def __init__( @@ -892,25 +899,26 @@ def __init__( """Initializes `CategoricalCrossentropy` instance. Args: - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. 
When > 0, label values are smoothed, - meaning the confidence on label values are relaxed. For example, if - `0.1`, use `0.1 / num_classes` for non-target labels and - `0.9 + 0.1 / num_classes` for target labels. - axis: The axis along which to compute crossentropy (the features - axis). Defaults to -1. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. - Defaults to 'categorical_crossentropy'. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability + distribution. + label_smoothing: Float in [0, 1]. When > 0, label values are + smoothed, meaning the confidence on label values is relaxed. + For example, if `0.1`, use `0.1 / num_classes` for non-target + labels and `0.9 + 0.1 / num_classes` for target labels. + axis: The axis along which to compute crossentropy (the features + axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + Default value is `AUTO`. `AUTO` indicates that the reduction + option will be determined by the usage context. For almost all + cases this defaults to `SUM_OVER_BATCH_SIZE`. When used under a + `tf.distribute.Strategy`, except via `Model.compile()` and + `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` + will raise an error. Please see this custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + Defaults to 'categorical_crossentropy'. """ super().__init__( categorical_crossentropy,
@@ -1119,24 +1127,26 @@ def __init__( """Initializes `SparseCategoricalCrossentropy` instance. Args: - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - ignore_class: Optional integer. The ID of a class to be ignored during - loss computation. This is useful, for example, in segmentation - problems featuring a "void" class (commonly -1 or 255) in - segmentation maps. - By default (`ignore_class=None`), all classes are considered. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to - 'sparse_categorical_crossentropy'. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability + distribution. + ignore_class: Optional integer. The ID of a class to be ignored + during loss computation. This is useful, for example, in + segmentation problems featuring a "void" class (commonly -1 or + 255) in segmentation maps. + By default (`ignore_class=None`), all classes are considered. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + Defaults to 'sparse_categorical_crossentropy'. """ super().__init__( sparse_categorical_crossentropy,
@@ -1192,16 +1202,17 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="hinge"): """Initializes `Hinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'hinge'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'hinge'. """ super().__init__(hinge, name=name, reduction=reduction)
@@ -1253,16 +1264,17 @@ def __init__( """Initializes `SquaredHinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'squared_hinge'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'squared_hinge'.
""" super().__init__(squared_hinge, name=name, reduction=reduction) @@ -1312,16 +1324,18 @@ def __init__( """Initializes `CategoricalHinge` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'categorical_hinge'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction ption will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + Defaults to 'categorical_hinge'. """ super().__init__(categorical_hinge, name=name, reduction=reduction) @@ -1368,16 +1382,17 @@ def __init__(self, reduction=losses_utils.ReductionV2.AUTO, name="poisson"): """Initializes `Poisson` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'poisson'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction ption will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'poisson'. """ super().__init__(poisson, name=name, reduction=reduction) @@ -1427,16 +1442,17 @@ def __init__( """Initializes `LogCosh` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'log_cosh'. 
+ reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'log_cosh'. """ super().__init__(log_cosh, name=name, reduction=reduction)
@@ -1487,16 +1503,18 @@ def __init__( """Initializes `KLDivergence` instance. Args: - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'kl_divergence'. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. + Defaults to 'kl_divergence'. """ super().__init__(kl_divergence, name=name, reduction=reduction)
@@ -1554,18 +1572,19 @@ def __init__( """Initializes `Huber` instance. Args: - delta: A float, the point where the Huber loss function changes from a - quadratic to linear. - reduction: Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. Defaults to 'huber_loss'. + delta: A float, the point where the Huber loss function changes from + a quadratic to linear. + reduction: Type of `tf.keras.losses.Reduction` to apply to + loss. Default value is `AUTO`. `AUTO` indicates that the + reduction option will be determined by the usage context. For + almost all cases this defaults to `SUM_OVER_BATCH_SIZE`. When + used under a `tf.distribute.Strategy`, except via + `Model.compile()` and `Model.fit()`, using `AUTO` or + `SUM_OVER_BATCH_SIZE` will raise an error. Please see this + custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'huber_loss'.
""" super().__init__(huber, name=name, reduction=reduction, delta=delta) @@ -1597,11 +1616,11 @@ def mean_squared_error(y_true, y_pred): ... loss.numpy(), np.mean(np.square(y_true - y_pred), axis=-1)) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. + Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1612,15 +1631,15 @@ def _ragged_tensor_apply_loss(loss_fn, y_true, y_pred, y_pred_extra_dim=False): """Apply a loss function on a per batch basis. Args: - loss_fn: The loss function - y_true: truth values (RaggedTensor) - y_pred: predicted values (RaggedTensor) - y_pred_extra_dim: whether y_pred has an additional dimension compared to + loss_fn: The loss function + y_true: truth values (RaggedTensor) + y_pred: predicted values (RaggedTensor) + y_pred_extra_dim: whether y_pred has an additional dimension compared to y_true Returns: - Loss-function result. A dense tensor if the output has a single dimension - (per-batch loss value); a ragged tensor otherwise. + Loss-function result. A dense tensor if the output has a single + dimension (per-batch loss value); a ragged tensor otherwise. """ def rt_is_equiv_dense(rt): @@ -1630,7 +1649,7 @@ def rt_is_equiv_dense(rt): without loss of information. Args: - rt: RaggedTensor. + rt: RaggedTensor. """ return tf.reduce_all( [ @@ -1702,14 +1721,15 @@ def _ragged_tensor_mse(y_true, y_pred): """Implements support for handling RaggedTensors. Args: - y_true: RaggedTensor truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: RaggedTensor predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: RaggedTensor truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: RaggedTensor predicted values. + shape = `[batch_size, d0, .. dN]`. Returns: - Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. - When the number of dimensions of the batch feature vector [d0, .. dN] is - greater than one the return value is a RaggedTensor. Otherwise a Dense - tensor with dimensions [batch_size] is returned. + Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. + When the number of dimensions of the batch feature vector [d0, .. dN] is + greater than one the return value is a RaggedTensor. Otherwise, a Dense + tensor with dimensions [batch_size] is returned. """ return _ragged_tensor_apply_loss(mean_squared_error, y_true, y_pred) @@ -1738,11 +1758,11 @@ def mean_absolute_error(y_true, y_pred): ... loss.numpy(), np.mean(np.abs(y_true - y_pred), axis=-1)) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Mean absolute error values. shape = `[batch_size, d0, .. dN-1]`. + Mean absolute error values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1781,12 +1801,12 @@ def mean_absolute_percentage_error(y_true, y_pred): ... 100. * np.mean(np.abs((y_true - y_pred) / y_true), axis=-1)) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. 
- y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Mean absolute percentage error values. shape = `[batch_size, d0, .. - dN-1]`. + Mean absolute percentage error values. shape = `[batch_size, d0, .. + dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1832,12 +1852,12 @@ def mean_squared_logarithmic_error(y_true, y_pred): ... np.square(np.log(y_true + 1.) - np.log(y_pred + 1.)), axis=-1)) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Mean squared logarithmic error values. shape = `[batch_size, d0, .. - dN-1]`. + Mean squared logarithmic error values. shape = `[batch_size, d0, .. + dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1890,13 +1910,13 @@ def squared_hinge(y_true, y_pred): ... np.mean(np.square(np.maximum(1. - y_true * y_pred, 0.)), axis=-1)) Args: - y_true: The ground truth values. `y_true` values are expected to be -1 or - 1. If binary (0 or 1) labels are provided we will convert them to -1 or - 1. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: The ground truth values. `y_true` values are expected to be -1 + or 1. If binary (0 or 1) labels are provided we will convert them to + -1 or 1. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Squared hinge loss values. shape = `[batch_size, d0, .. dN-1]`. + Squared hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1924,13 +1944,13 @@ def hinge(y_true, y_pred): ... np.mean(np.maximum(1. - y_true * y_pred, 0.), axis=-1)) Args: - y_true: The ground truth values. `y_true` values are expected to be -1 or - 1. If binary (0 or 1) labels are provided they will be converted to -1 - or 1. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: The ground truth values. `y_true` values are expected to be -1 + or 1. If binary (0 or 1) labels are provided we will convert them to + -1 or 1. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Hinge loss values. shape = `[batch_size, d0, .. dN-1]`. + Hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1958,12 +1978,12 @@ def categorical_hinge(y_true, y_pred): >>> assert np.array_equal(loss.numpy(), np.maximum(0., neg - pos + 1.)) Args: - y_true: The ground truth values. `y_true` values are expected to be - either `{-1, +1}` or `{0, 1}` (i.e. a one-hot-encoded tensor). - y_pred: The predicted values. + y_true: The ground truth values. `y_true` values are expected to be + either `{-1, +1}` or `{0, 1}` (i.e. a one-hot-encoded tensor). + y_pred: The predicted values. Returns: - Categorical hinge loss values. + Categorical hinge loss values. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -1987,13 +2007,13 @@ def huber(y_true, y_pred, delta=1.0): where d is `delta`. 
See: https://en.wikipedia.org/wiki/Huber_loss Args: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. - delta: A float, the point where the Huber loss function changes from a - quadratic to linear. + y_true: tensor of true targets. + y_pred: tensor of predicted targets. + delta: A float, the point where the Huber loss function changes from a + quadratic to linear. Returns: - Tensor with one scalar loss entry per sample. + Tensor with one scalar loss entry per sample. """ y_pred = tf.cast(y_pred, dtype=backend.floatx()) y_true = tf.cast(y_true, dtype=backend.floatx()) @@ -2040,11 +2060,11 @@ def log_cosh(y_true, y_pred): ... atol=1e-5) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Logcosh error values. shape = `[batch_size, d0, .. dN-1]`. + Logcosh error values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -2077,18 +2097,18 @@ def categorical_crossentropy( array([0.0513, 2.303], dtype=float32) Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: Defaults to -1. The dimension along which the entropy is - computed. + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Defaults to -1. The dimension along which the entropy is + computed. Returns: - Categorical crossentropy loss value. + Categorical crossentropy loss value. """ if isinstance(axis, bool): raise ValueError( @@ -2131,18 +2151,18 @@ def _ragged_tensor_categorical_crossentropy( """Implements support for handling RaggedTensors. Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: The axis along which to compute crossentropy (the features axis). - Defaults to -1. + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: The axis along which to compute crossentropy (the features axis). + Defaults to -1. Returns: - Categorical crossentropy loss value. 
+ Categorical crossentropy loss value. Expected shape: (batch, sequence_len, n_classes) with sequence_len being variable per batch.
@@ -2337,19 +2357,20 @@ def sparse_categorical_crossentropy( [0.0000000e+00, 0.0000000e+00]]], dtype=float32) Args: - y_true: Ground truth values. - y_pred: The predicted values. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - axis: Defaults to -1. The dimension along which the entropy is - computed. - ignore_class: Optional integer. The ID of a class to be ignored during - loss computation. This is useful, for example, in segmentation - problems featuring a "void" class (commonly -1 or 255) in segmentation - maps. By default (`ignore_class=None`), all classes are considered. + y_true: Ground truth values. + y_pred: The predicted values. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + axis: Defaults to -1. The dimension along which the entropy is + computed. + ignore_class: Optional integer. The ID of a class to be ignored during + loss computation. This is useful, for example, in segmentation + problems featuring a "void" class (commonly -1 or 255) in + segmentation maps. By default (`ignore_class=None`), all classes are + considered. Returns: - Sparse categorical crossentropy loss value. + Sparse categorical crossentropy loss value. """ return backend.sparse_categorical_crossentropy( y_true,
@@ -2404,18 +2425,18 @@ def binary_crossentropy( array([0.916 , 0.714], dtype=float32) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels by - squeezing them towards 0.5 That is, using `1. - 0.5 * label_smoothing` - for the target class and `0.5 * label_smoothing` for the non-target - class. - axis: The axis along which the mean is computed. Defaults to -1. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels by + squeezing them towards 0.5. That is, using + `1. - 0.5 * label_smoothing` for the target class and + `0.5 * label_smoothing` for the non-target class. + axis: The axis along which the mean is computed. Defaults to -1. Returns: - Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. + Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype)
@@ -2441,17 +2462,17 @@ def _ragged_tensor_binary_crossentropy( """Implements support for handling RaggedTensors. Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels.
- axis: Axis along which to compute crossentropy. + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Axis along which to compute crossentropy. Returns: - Binary crossentropy loss value. + Binary crossentropy loss value. Expected shape: (batch, sequence_len) with sequence_len being variable per batch. @@ -2514,24 +2535,25 @@ def binary_focal_crossentropy( array([0.330, 0.206], dtype=float32) Args: - y_true: Ground truth values, of shape `(batch_size, d0, .. dN)`. - y_pred: The predicted values, of shape `(batch_size, d0, .. dN)`. - apply_class_balancing: A bool, whether to apply weight balancing on the - binary classes 0 and 1. - alpha: A weight balancing factor for class 1, default is `0.25` as - mentioned in the reference. The weight for class 0 is `1.0 - alpha`. - gamma: A focusing parameter, default is `2.0` as mentioned in the - reference. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. - label_smoothing: Float in `[0, 1]`. If higher than 0 then smooth the - labels by squeezing them towards `0.5`, i.e., using `1. - 0.5 * - label_smoothing` for the target class and `0.5 * label_smoothing` for - the non-target class. - axis: The axis along which the mean is computed. Defaults to `-1`. + y_true: Ground truth values, of shape `(batch_size, d0, .. dN)`. + y_pred: The predicted values, of shape `(batch_size, d0, .. dN)`. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in the reference. The weight for class 0 is `1.0 - alpha`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in `[0, 1]`. If higher than 0 then smooth the + labels by squeezing them towards `0.5`, i.e., using `1. - 0.5 * + label_smoothing` for the target class and `0.5 * label_smoothing` + for the non-target class. + axis: The axis along which the mean is computed. Defaults to `-1`. Returns: - Binary focal crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. + Binary focal crossentropy loss value. + shape = `[batch_size, d0, .. dN-1]`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -2579,25 +2601,25 @@ def _ragged_tensor_binary_focal_crossentropy( the number of batches. Args: - y_true: Tensor of one-hot true targets. - y_pred: Tensor of predicted targets. - apply_class_balancing: A bool, whether to apply weight balancing on the - binary classes 0 and 1. - alpha: A weight balancing factor for class 1, default is `0.25` as - mentioned in the reference [Lin et al., 2018]( - https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is - `1.0 - alpha`. - gamma: A focusing parameter, default is `2.0` as mentioned in the - reference. - from_logits: Whether `y_pred` is expected to be a logits tensor. By - default, we assume that `y_pred` encodes a probability distribution. 
- label_smoothing: Float in `[0, 1]`. If > `0` then smooth the labels. For - example, if `0.1`, use `0.1 / num_classes` for non-target labels - and `0.9 + 0.1 / num_classes` for target labels. - axis: Axis along which to compute crossentropy. + y_true: Tensor of one-hot true targets. + y_pred: Tensor of predicted targets. + apply_class_balancing: A bool, whether to apply weight balancing on the + binary classes 0 and 1. + alpha: A weight balancing factor for class 1, default is `0.25` as + mentioned in the reference [Lin et al., 2018]( + https://arxiv.org/pdf/1708.02002.pdf). The weight for class 0 is + `1.0 - alpha`. + gamma: A focusing parameter, default is `2.0` as mentioned in the + reference. + from_logits: Whether `y_pred` is expected to be a logits tensor. By + default, we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in `[0, 1]`. If > `0` then smooth the labels. For + example, if `0.1`, use `0.1 / num_classes` for non-target labels + and `0.9 + 0.1 / num_classes` for target labels. + axis: Axis along which to compute crossentropy. Returns: - Binary focal crossentropy loss value. + Binary focal crossentropy loss value. """ fn = functools.partial( binary_focal_crossentropy, @@ -2641,14 +2663,14 @@ def kl_divergence(y_true, y_pred): ... loss.numpy(), np.sum(y_true * np.log(y_true / y_pred), axis=-1)) Args: - y_true: Tensor of true targets. - y_pred: Tensor of predicted targets. + y_true: Tensor of true targets. + y_pred: Tensor of predicted targets. Returns: - A `Tensor` with loss. + A `Tensor` with loss. Raises: - TypeError: If `y_true` cannot be cast to the `y_pred.dtype`. + TypeError: If `y_true` cannot be cast to the `y_pred.dtype`. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -2677,14 +2699,14 @@ def poisson(y_true, y_pred): ... atol=1e-5) Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Poisson loss value. shape = `[batch_size, d0, .. dN-1]`. + Poisson loss value. shape = `[batch_size, d0, .. dN-1]`. Raises: - InvalidArgumentError: If `y_true` and `y_pred` have incompatible shapes. + InvalidArgumentError: If `y_true` and `y_pred` have incompatible shapes. """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) @@ -2727,12 +2749,12 @@ def cosine_similarity(y_true, y_pred, axis=-1): array([-0., -0.999, 0.999], dtype=float32) Args: - y_true: Tensor of true targets. - y_pred: Tensor of predicted targets. - axis: Axis along which to determine similarity. + y_true: Tensor of true targets. + y_pred: Tensor of predicted targets. + axis: Axis along which to determine similarity. Returns: - Cosine similarity tensor. + Cosine similarity tensor. """ y_true = tf.linalg.l2_normalize(y_true, axis=axis) y_pred = tf.linalg.l2_normalize(y_pred, axis=axis) @@ -2791,18 +2813,18 @@ class CosineSimilarity(LossFunctionWrapper): ``` Args: - axis: The axis along which the cosine similarity is computed - (the features axis). Defaults to -1. - reduction: Type of `tf.keras.losses.Reduction` to apply to loss. - Default value is `AUTO`. `AUTO` indicates that the reduction option will - be determined by the usage context. For almost all cases this defaults - to `SUM_OVER_BATCH_SIZE`. 
When used under a - `tf.distribute.Strategy`, except via `Model.compile()` and - `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: Optional name for the instance. + axis: The axis along which the cosine similarity is computed + (the features axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + Default value is `AUTO`. `AUTO` indicates that the reduction option + will be determined by the usage context. For almost all cases this + defaults to `SUM_OVER_BATCH_SIZE`. When used under a + `tf.distribute.Strategy`, except via `Model.compile()` and + `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an + error. Please see this custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'cosine_similarity'. """ def __init__( self, axis=-1, reduction=losses_utils.ReductionV2.AUTO, name="cosine_similarity", ):
@@ -2849,10 +2871,12 @@ def serialize(loss, use_legacy_format=False): """Serializes loss function or `Loss` instance. Args: - loss: A Keras `Loss` instance or a loss function. + loss: A Keras `Loss` instance or a loss function. + use_legacy_format: Boolean, whether to use the legacy serialization + format. Returns: - Loss configuration dictionary. + Loss configuration dictionary. """ if use_legacy_format: return legacy_serialization.serialize_keras_object(loss)
@@ -2866,8 +2890,10 @@ def deserialize(name, custom_objects=None, use_legacy_format=False): Args: name: Loss configuration. custom_objects: Optional dictionary mapping names (strings) to custom - objects (classes and functions) to be considered during - deserialization. + objects (classes and functions) to be considered during + deserialization. + use_legacy_format: Boolean, whether to use the legacy serialization + format. Returns: A Keras `Loss` instance or a loss function.
@@ -2911,15 +2937,15 @@ def get(identifier): Args: - identifier: A loss identifier. One of None or string name of a loss - function/class or loss configuration dictionary or a loss function or a - loss class instance. + identifier: A loss identifier. One of None or string name of a loss + function/class or loss configuration dictionary or a loss function + or a loss class instance. Returns: - A Keras loss as a `function`/ `Loss` class instance. + A Keras loss as a `function`/`Loss` class instance. Raises: - ValueError: If `identifier` cannot be interpreted. + ValueError: If `identifier` cannot be interpreted. """ if identifier is None: return None
From 7c4ca54b99ba9c4f2c9d55b5127a180763986c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kaan=20B=C4%B1=C3=A7akc=C4=B1?= Date: Sat, 22 Apr 2023 10:41:03 +0100 Subject: [PATCH 2/4] Move CosineSimilarity class to the top. --- keras/losses.py | 154 ++++++++++++++++++++++++------------------------ 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/keras/losses.py b/keras/losses.py index 609e30c5c6a..9a0c6a3254a 100644 --- a/keras/losses.py +++ b/keras/losses.py
@@ -1157,6 +1157,83 @@ def __init__( ) +@keras_export("keras.losses.CosineSimilarity") +class CosineSimilarity(LossFunctionWrapper): + """Computes the cosine similarity between labels and predictions. + + Note that it is a number between -1 and 1. When it is a negative number + between -1 and 0, 0 indicates orthogonality and values closer to -1 + indicate greater similarity.
The values closer to 1 indicate greater + dissimilarity. This makes it usable as a loss function in a setting + where you try to maximize the proximity between predictions and targets. + If either `y_true` or `y_pred` is a zero vector, cosine similarity will be 0 + regardless of the proximity between predictions and targets. + + `loss = -sum(l2_norm(y_true) * l2_norm(y_pred))` + + Standalone usage: + + >>> y_true = [[0., 1.], [1., 1.]] + >>> y_pred = [[1., 0.], [1., 1.]] + >>> # Using 'auto'/'sum_over_batch_size' reduction type. + >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1) + >>> # l2_norm(y_true) = [[0., 1.], [1./1.414, 1./1.414]] + >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414, 1./1.414]] + >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] + >>> # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) + >>> # = -((0. + 0.) + (0.5 + 0.5)) / 2 + >>> cosine_loss(y_true, y_pred).numpy() + -0.5 + + >>> # Calling with 'sample_weight'. + >>> cosine_loss(y_true, y_pred, sample_weight=[0.8, 0.2]).numpy() + -0.0999 + + >>> # Using 'sum' reduction type. + >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1, + ... reduction=tf.keras.losses.Reduction.SUM) + >>> cosine_loss(y_true, y_pred).numpy() + -0.999 + + >>> # Using 'none' reduction type. + >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1, + ... reduction=tf.keras.losses.Reduction.NONE) + >>> cosine_loss(y_true, y_pred).numpy() + array([-0., -0.999], dtype=float32) + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + loss=tf.keras.losses.CosineSimilarity(axis=1)) + ``` + + Args: + axis: The axis along which the cosine similarity is computed + (the features axis). Defaults to -1. + reduction: Type of `tf.keras.losses.Reduction` to apply to loss. + Default value is `AUTO`. `AUTO` indicates that the reduction option + will be determined by the usage context. For almost all cases this + defaults to `SUM_OVER_BATCH_SIZE`. When used under a + `tf.distribute.Strategy`, except via `Model.compile()` and + `Model.fit()`, using `AUTO` or `SUM_OVER_BATCH_SIZE` will raise an + error. Please see this custom training [tutorial]( + https://www.tensorflow.org/tutorials/distribute/custom_training) + for more details. + name: Optional name for the instance. Defaults to 'cosine_similarity'. + """ + + def __init__( + self, + axis=-1, + reduction=losses_utils.ReductionV2.AUTO, + name="cosine_similarity", + ): + super().__init__( + cosine_similarity, reduction=reduction, name=name, axis=axis + ) + + @keras_export("keras.losses.Hinge") class Hinge(LossFunctionWrapper): """Computes the hinge loss between `y_true` & `y_pred`. @@ -2761,83 +2838,6 @@ def cosine_similarity(y_true, y_pred, axis=-1): return -tf.reduce_sum(y_true * y_pred, axis=axis) -@keras_export("keras.losses.CosineSimilarity") -class CosineSimilarity(LossFunctionWrapper): - """Computes the cosine similarity between labels and predictions. - - Note that it is a number between -1 and 1. When it is a negative number - between -1 and 0, 0 indicates orthogonality and values closer to -1 - indicate greater similarity. The values closer to 1 indicate greater - dissimilarity. This makes it usable as a loss function in a setting - where you try to maximize the proximity between predictions and targets. - If either `y_true` or `y_pred` is a zero vector, cosine similarity will be 0 - regardless of the proximity between predictions and targets. 
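Since this commit only relocates the class, its public behavior should be unchanged. A quick sketch (assuming TensorFlow 2.x) that reproduces the standalone example from the moved docstring:

```python
import tensorflow as tf

# The export path is untouched by the move; only the class's position
# inside losses.py changes.
cosine_loss = tf.keras.losses.CosineSimilarity(axis=1)

y_true = [[0.0, 1.0], [1.0, 1.0]]
y_pred = [[1.0, 0.0], [1.0, 1.0]]

# -((0. + 0.) + (0.5 + 0.5)) / 2 = -0.5, matching the docstring doctest.
print(cosine_loss(y_true, y_pred).numpy())
```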
From 643b1ae8d049ffb459205b415e9ff8430d488a3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kaan=20B=C4=B1=C3=A7akc=C4=B1?=
Date: Sat, 22 Apr 2023 10:43:19 +0100
Subject: [PATCH 3/4] Fix formatting

---
 keras/losses.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/keras/losses.py b/keras/losses.py
index 9a0c6a3254a..0147f2a1e95 100644
--- a/keras/losses.py
+++ b/keras/losses.py
@@ -634,9 +634,9 @@ def __init__(
 
         Args:
             from_logits: Whether to interpret `y_pred` as a tensor of
-                [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we
-                assume that `y_pred` contains probabilities (i.e., values in [0,
-                1]).
+                [logit](https://en.wikipedia.org/wiki/Logit) values. By default,
+                we assume that `y_pred` contains probabilities (i.e., values in
+                [0, 1]).
             label_smoothing: Float in [0, 1]. When 0, no smoothing occurs.
                 When > 0, we compute the loss between the predicted labels
                 and a smoothed version of the true labels, where the smoothing
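For context, the `from_logits` contract this docstring describes can be exercised directly — shown here with `BinaryCrossentropy` for illustration, as a sketch assuming TensorFlow 2.x:

```python
import tensorflow as tf

y_true = [[0.0], [1.0]]
logits = [[-2.0], [3.0]]

# With from_logits=True the loss applies the sigmoid internally; with the
# default, y_pred is interpreted as probabilities in [0, 1].
bce_logits = tf.keras.losses.BinaryCrossentropy(from_logits=True)
bce_probs = tf.keras.losses.BinaryCrossentropy()

# Both paths agree up to numerical precision.
print(bce_logits(y_true, logits).numpy())
print(bce_probs(y_true, tf.sigmoid(logits)).numpy())
```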
From 8addc40cccb9758c062c8c5fff5904301fbf8036 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kaan=20B=C4=B1=C3=A7akc=C4=B1?= <46622558+Frightera@users.noreply.github.com>
Date: Thu, 27 Apr 2023 18:49:05 +0100
Subject: [PATCH 4/4] Add Defaults to `False` for use_legacy_format param.

---
 keras/losses.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/keras/losses.py b/keras/losses.py
index 0147f2a1e95..8ed1cfa65dc 100644
--- a/keras/losses.py
+++ b/keras/losses.py
@@ -2873,7 +2873,7 @@ def serialize(loss, use_legacy_format=False):
     Args:
         loss: A Keras `Loss` instance or a loss function.
         use_legacy_format: Boolean, whether to use the legacy serialization
-            format.
+            format. Defaults to `False`.
 
     Returns:
         Loss configuration dictionary.
@@ -2893,7 +2893,7 @@ def deserialize(name, custom_objects=None, use_legacy_format=False):
             objects (classes and functions) to be considered during
             deserialization.
         use_legacy_format: Boolean, whether to use the legacy serialization
-            format.
+            format. Defaults to `False`.
 
     Returns:
         A Keras `Loss` instance or a loss function.
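The default documented here can be checked with a serialize/deserialize round trip — a minimal sketch, assuming TensorFlow 2.x where both functions accept `use_legacy_format`:

```python
import tensorflow as tf

loss = tf.keras.losses.CosineSimilarity(axis=1)

# use_legacy_format defaults to False, so this round trip exercises the
# non-legacy serialization path documented above.
config = tf.keras.losses.serialize(loss)
restored = tf.keras.losses.deserialize(config)

assert isinstance(restored, tf.keras.losses.CosineSimilarity)
assert restored.name == "cosine_similarity"
```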