From 1a57052c4e234a32e05cf7fea0ed4956f1ae8439 Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Wed, 12 Apr 2023 20:36:47 -0400
Subject: [PATCH 1/2]
 [keras/optimizers/legacy/adadelta.py,keras/optimizers/legacy/adagrad.py,keras/optimizers/legacy/adam.py,keras/optimizers/legacy/ftrl.py,keras/optimizers/legacy/gradient_descent.py,keras/optimizers/legacy/optimizer_v2.py,keras/optimizers/legacy/rmsprop.py,keras/optimizers/legacy_learning_rate_decay.py]
 Standardise docstring usage of "Default to"

---
 keras/optimizers/legacy/adadelta.py            |  2 +-
 keras/optimizers/legacy/adagrad.py             |  2 +-
 keras/optimizers/legacy/adam.py                | 17 +++++++++--------
 keras/optimizers/legacy/ftrl.py                |  6 +++---
 keras/optimizers/legacy/gradient_descent.py    |  6 +++---
 keras/optimizers/legacy/optimizer_v2.py        |  4 ++--
 keras/optimizers/legacy/rmsprop.py             | 15 ++++++++-------
 keras/optimizers/legacy_learning_rate_decay.py |  9 +++++----
 8 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/keras/optimizers/legacy/adadelta.py b/keras/optimizers/legacy/adadelta.py
index 4b8b1680e2f..9310a9bfcfd 100644
--- a/keras/optimizers/legacy/adadelta.py
+++ b/keras/optimizers/legacy/adadelta.py
@@ -48,10 +48,10 @@ class Adadelta(optimizer_v2.OptimizerV2):
     learning_rate: Initial value for the learning rate: either a floating
       point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule`
       instance.
-      Defaults to 0.001.
       Note that `Adadelta` tends to benefit from higher initial learning rate
       values compared to other optimizers. To match the exact form in the
       original paper, use 1.0.
+      Defaults to `0.001`.
     rho: A `Tensor` or a floating point value. The decay rate.
     epsilon: Small floating point value used to maintain numerical stability.
     name: Optional name prefix for the operations created when applying
diff --git a/keras/optimizers/legacy/adagrad.py b/keras/optimizers/legacy/adagrad.py
index c29280c8690..4b130051416 100644
--- a/keras/optimizers/legacy/adagrad.py
+++ b/keras/optimizers/legacy/adagrad.py
@@ -40,10 +40,10 @@ class Adagrad(optimizer_v2.OptimizerV2):
     learning_rate: Initial value for the learning rate: either a floating
       point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule`
      instance.
-      Defaults to 0.001.
       Note that `Adagrad` tends to benefit from higher initial learning rate
       values compared to other optimizers. To match the exact form in the
       original paper, use 1.0.
+      Defaults to `0.001`.
     initial_accumulator_value: Floating point value. Starting value for the
       accumulators (per-parameter momentum values). Must be non-negative.
diff --git a/keras/optimizers/legacy/adam.py b/keras/optimizers/legacy/adam.py
index a416d22f10b..3678f316de8 100644
--- a/keras/optimizers/legacy/adam.py
+++ b/keras/optimizers/legacy/adam.py
@@ -44,17 +44,18 @@ class Adam(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use, The
-      learning rate. Defaults to 0.001.
+      learning rate. Defaults to `0.001`.
     beta_1: A float value or a constant float tensor, or a callable
       that takes no arguments and returns the actual value to use. The
-      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
+      exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
     beta_2: A float value or a constant float tensor, or a callable
       that takes no arguments and returns the actual value to use, The
-      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
+      exponential decay rate for the 2nd moment estimates. Defaults to
+      `0.999`.
     epsilon: A small constant for numerical stability. This epsilon is
       "epsilon hat" in the Kingma and Ba paper (in the formula just before
       Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-      1e-7.
+      `1e-7`.
     amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
       the paper "On the Convergence of Adam and beyond". Defaults to `False`.
     name: Optional name for the operations created when applying gradients.
@@ -364,19 +365,19 @@ def __init__(
           learning_rate: A `Tensor`, floating point value, or a schedule that
             is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
             callable that takes no arguments and returns the actual value to
-            use, The learning rate. Defaults to 0.001.
+            use, The learning rate. Defaults to `0.001`.
           beta_1: A float value or a constant float tensor, or a callable that
             takes no arguments and returns the actual value to use. The
             exponential decay rate for the 1st moment estimates. Defaults to
-            0.9.
+            `0.9`.
           beta_2: A float value or a constant float tensor, or a callable that
             takes no arguments and returns the actual value to use, The
            exponential decay rate for the 2nd moment estimates. Defaults to
-            0.999.
+            `0.999`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
-            to 1e-7.
+            to `1e-7`.
          amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond". Defaults to
            `False`.
diff --git a/keras/optimizers/legacy/ftrl.py b/keras/optimizers/legacy/ftrl.py
index d41536ecaf1..0e592b26874 100644
--- a/keras/optimizers/legacy/ftrl.py
+++ b/keras/optimizers/legacy/ftrl.py
@@ -81,9 +81,9 @@ class Ftrl(optimizer_v2.OptimizerV2):
     initial_accumulator_value: The starting value for accumulators. Only
       zero or positive values are allowed.
     l1_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
+      equal to zero. Defaults to `0.0`.
     l2_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
+      equal to zero. Defaults to `0.0`.
     name: Optional name prefix for the operations created when applying
       gradients. Defaults to `"Ftrl"`.
     l2_shrinkage_regularization_strength: A float value, must be greater than
@@ -91,7 +91,7 @@ class Ftrl(optimizer_v2.OptimizerV2):
       stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
       When input is sparse shrinkage will only happen on the active weights.
     beta: A float value, representing the beta value from the paper.
-      Defaults to 0.0.
+      Defaults to `0.0`.
     **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
       `clipnorm`, `global_clipnorm`.
       If `clipvalue` (float) is set, the gradient of each weight
diff --git a/keras/optimizers/legacy/gradient_descent.py b/keras/optimizers/legacy/gradient_descent.py
index 0bcb10fdfec..8d305f705e6 100644
--- a/keras/optimizers/legacy/gradient_descent.py
+++ b/keras/optimizers/legacy/gradient_descent.py
@@ -54,10 +54,10 @@ class SGD(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use. The
-      learning rate. Defaults to 0.01.
+      learning rate. Defaults to `0.01`.
     momentum: float hyperparameter >= 0 that accelerates gradient descent in
-      the relevant direction and dampens oscillations. Defaults to 0, i.e.,
-      vanilla gradient descent.
+      the relevant direction and dampens oscillations. Vanilla gradient
+      descent means no momentum. Defaults to `0.`.
     nesterov: boolean. Whether to apply Nesterov momentum. Defaults to
       `False`.
     name: Optional name prefix for the operations created when applying
diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py
index 7deacfad20e..ca56b07cfaa 100644
--- a/keras/optimizers/legacy/optimizer_v2.py
+++ b/keras/optimizers/legacy/optimizer_v2.py
@@ -692,8 +692,8 @@ def apply_gradients(
 
         Args:
           grads_and_vars: List of (gradient, variable) pairs.
-          name: Optional name for the returned operation. Default to the name
-            passed to the `Optimizer` constructor.
+          name: Optional name for the returned operation. When None, uses the
+            name passed to the `Optimizer` constructor. Defaults to `None`.
          experimental_aggregate_gradients: Whether to sum gradients from
            different replicas in the presence of `tf.distribute.Strategy`. If
            False, it's user responsibility to aggregate the gradients. Default
diff --git a/keras/optimizers/legacy/rmsprop.py b/keras/optimizers/legacy/rmsprop.py
index 626c333398d..5537de9cc8a 100644
--- a/keras/optimizers/legacy/rmsprop.py
+++ b/keras/optimizers/legacy/rmsprop.py
@@ -45,13 +45,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use. The
-      learning rate. Defaults to 0.001.
-    rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
-    momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+      learning rate. Defaults to `0.001`.
+    rho: Discounting factor for the history/coming gradient. Defaults to
+      `0.9`.
+    momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
     epsilon: A small constant for numerical stability. This epsilon is
       "epsilon hat" in the Kingma and Ba paper (in the formula just before
       Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-      1e-7.
+      `1e-7`.
     centered: Boolean. If `True`, gradients are normalized by the estimated
       variance of the gradient; if False, by the uncentered second moment.
       Setting this to `True` may help with training, but is slightly more
@@ -111,10 +112,10 @@ def __init__(
           learning_rate: A `Tensor`, floating point value, or a schedule that
            is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
            callable that takes no arguments and returns the actual value to
-            use. The learning rate. Defaults to 0.001.
+            use. The learning rate. Defaults to `0.001`.
          rho: Discounting factor for the history/coming gradient. Defaults to
-            0.9.
-          momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+            `0.9`.
+          momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py
index a75a43e0372..93bd9dabd1a 100644
--- a/keras/optimizers/legacy_learning_rate_decay.py
+++ b/keras/optimizers/legacy_learning_rate_decay.py
@@ -79,7 +79,7 @@ def exponential_decay(
       The decay rate.
     staircase: Boolean. If `True` decay the learning rate at discrete
       intervals
-    name: String.  Optional name of the operation.  Defaults to
+    name: String. Optional name of the operation. Defaults to
       'ExponentialDecay'.
 
   Returns:
@@ -264,9 +264,10 @@ def polynomial_decay(
     end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
       number. The minimal end learning rate.
     power: A scalar `float32` or `float64` `Tensor` or a Python number. The
-      power of the polynomial. Defaults to linear, 1.0.
-    cycle: A boolean, whether or not it should cycle beyond decay_steps.
-    name: String.  Optional name of the operation.  Defaults to
+      power of the polynomial. Linear is default. Defaults to `1.0`.
+    cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to
+      `False`.
+    name: String. Optional name of the operation. Defaults to
       'PolynomialDecay'.
 
   Returns:

From 0ebca04adac6604cb12babf2c5b5ef7ac95ba0ea Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Wed, 19 Apr 2023 23:04:18 -0400
Subject: [PATCH 2/2] [keras/optimizers/legacy/optimizer_v2.py] Backtick
 keywords in docstring ; [keras/optimizers/legacy_learning_rate_decay.py]
 Remove "Linear default"

---
 keras/optimizers/legacy/optimizer_v2.py        | 4 ++--
 keras/optimizers/legacy_learning_rate_decay.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py
index ca56b07cfaa..984d721f0b3 100644
--- a/keras/optimizers/legacy/optimizer_v2.py
+++ b/keras/optimizers/legacy/optimizer_v2.py
@@ -692,12 +692,12 @@ def apply_gradients(
 
         Args:
          grads_and_vars: List of (gradient, variable) pairs.
-          name: Optional name for the returned operation. When None, uses the
+          name: Optional name for the returned operation. When `None`, uses the
            name passed to the `Optimizer` constructor. Defaults to `None`.
          experimental_aggregate_gradients: Whether to sum gradients from
            different replicas in the presence of `tf.distribute.Strategy`. If
            False, it's user responsibility to aggregate the gradients. Default
-            to True.
+            to `True`.
 
        Returns:
          An `Operation` that applies the specified gradients. The `iterations`
diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py
index 93bd9dabd1a..8d8c217cecd 100644
--- a/keras/optimizers/legacy_learning_rate_decay.py
+++ b/keras/optimizers/legacy_learning_rate_decay.py
@@ -264,7 +264,7 @@ def polynomial_decay(
     end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
      number. The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a Python number. The
-      power of the polynomial. Linear is default. Defaults to `1.0`.
+      power of the polynomial. Defaults to `1.0`.
    cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to
      `False`.
    name: String. Optional name of the operation. Defaults to