Merge pull request #17975 from SamuelMarks:keras.optimizers.legacy-defaults-to

PiperOrigin-RevId: 526072785
tensorflower-gardener committed Apr 21, 2023
2 parents 64d50c4 + 0ebca04 commit 5c25497
Showing 6 changed files with 21 additions and 19 deletions.
2 changes: 1 addition & 1 deletion keras/optimizers/legacy/adadelta.py
@@ -48,10 +48,10 @@ class Adadelta(optimizer_v2.OptimizerV2):
  learning_rate: Initial value for the learning rate:
    either a floating point value,
    or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
-   Defaults to 0.001.
    Note that `Adadelta` tends to benefit from higher initial learning rate
    values compared to other optimizers.
    To match the exact form in the original paper, use 1.0.
+   Defaults to `0.001`.
  rho: A `Tensor` or a floating point value. The decay rate.
  epsilon: Small floating point value used to maintain numerical stability.
  name: Optional name prefix for the operations created when applying
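For context on the documented default, a minimal construction sketch of the legacy Adadelta optimizer (illustrative only, not part of the commit; the `tf.keras.optimizers.legacy` alias is assumed to be available):

    import tensorflow as tf

    # Documented default: learning_rate=0.001.
    opt = tf.keras.optimizers.legacy.Adadelta(learning_rate=0.001)
    # To match the exact form in the original paper, use a learning rate of 1.0.
    paper_opt = tf.keras.optimizers.legacy.Adadelta(learning_rate=1.0)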
2 changes: 1 addition & 1 deletion keras/optimizers/legacy/adagrad.py
@@ -40,10 +40,10 @@ class Adagrad(optimizer_v2.OptimizerV2):
  learning_rate: Initial value for the learning rate:
    either a floating point value,
    or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
-   Defaults to 0.001.
    Note that `Adagrad` tends to benefit from higher initial learning rate
    values compared to other optimizers.
    To match the exact form in the original paper, use 1.0.
+   Defaults to `0.001`.
  initial_accumulator_value: Floating point value.
    Starting value for the accumulators (per-parameter momentum values).
    Must be non-negative.
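Likewise, a brief sketch of the legacy Adagrad with the documented default learning rate (illustrative; the accumulator start value of 0.1 is shown explicitly as an assumption rather than quoted from the diff):

    import tensorflow as tf

    # Documented default learning rate is 0.001; the per-parameter accumulator
    # starting value must be non-negative (0.1 chosen here for illustration).
    opt = tf.keras.optimizers.legacy.Adagrad(
        learning_rate=0.001, initial_accumulator_value=0.1
    )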
6 changes: 3 additions & 3 deletions keras/optimizers/legacy/gradient_descent.py
@@ -54,10 +54,10 @@ class SGD(optimizer_v2.OptimizerV2):
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
-   learning rate. Defaults to 0.01.
+   learning rate. Defaults to `0.01`.
  momentum: float hyperparameter >= 0 that accelerates gradient descent in
-   the relevant direction and dampens oscillations. Defaults to 0, i.e.,
-   vanilla gradient descent.
+   the relevant direction and dampens oscillations. Vanilla gradient
+   descent means no momentum. Defaults to `0.`.
  nesterov: boolean. Whether to apply Nesterov momentum.
    Defaults to `False`.
  name: Optional name prefix for the operations created when applying
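A short sketch contrasting the documented SGD defaults with a momentum configuration (illustrative; the momentum value 0.9 is an assumption, not taken from the diff):

    import tensorflow as tf

    # Defaults: learning_rate=0.01, momentum=0.0 (no momentum), nesterov=False.
    vanilla = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)
    # Non-zero momentum accelerates descent in the relevant direction and
    # dampens oscillations.
    with_momentum = tf.keras.optimizers.legacy.SGD(
        learning_rate=0.01, momentum=0.9, nesterov=True
    )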
6 changes: 3 additions & 3 deletions keras/optimizers/legacy/optimizer_v2.py
@@ -692,12 +692,12 @@ def apply_gradients(
Args:
  grads_and_vars: List of (gradient, variable) pairs.
- name: Optional name for the returned operation. Default to the name
-   passed to the `Optimizer` constructor.
+ name: Optional name for the returned operation. When `None`, uses the
+   name passed to the `Optimizer` constructor. Defaults to `None`.
  experimental_aggregate_gradients: Whether to sum gradients from
    different replicas in the presence of `tf.distribute.Strategy`. If
    False, it's user responsibility to aggregate the gradients. Default
-   to True.
+   to `True`.
Returns:
  An `Operation` that applies the specified gradients. The `iterations`
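A minimal sketch of the `apply_gradients` contract documented above (illustrative; the toy variable, loss, and choice of SGD are assumptions):

    import tensorflow as tf

    opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.01)
    var = tf.Variable(1.0)
    with tf.GradientTape() as tape:
        loss = var * var
    grads = tape.gradient(loss, [var])
    # name=None falls back to the name passed to the Optimizer constructor;
    # experimental_aggregate_gradients=True sums gradients across replicas
    # under tf.distribute.Strategy.
    opt.apply_gradients(zip(grads, [var]))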
15 changes: 8 additions & 7 deletions keras/optimizers/legacy/rmsprop.py
@@ -45,13 +45,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
-   learning rate. Defaults to 0.001.
- rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
- momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+   learning rate. Defaults to `0.001`.
+ rho: Discounting factor for the history/coming gradient. Defaults to
+   `0.9`.
+ momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
  epsilon: A small constant for numerical stability. This epsilon is
    "epsilon hat" in the Kingma and Ba paper (in the formula just before
    Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-   1e-7.
+   `1e-7`.
  centered: Boolean. If `True`, gradients are normalized by the estimated
    variance of the gradient; if False, by the uncentered second moment.
    Setting this to `True` may help with training, but is slightly more
@@ -111,10 +112,10 @@ def __init__(
  learning_rate: A `Tensor`, floating point value, or a schedule that is
    a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
    callable that takes no arguments and returns the actual value to
-   use. The learning rate. Defaults to 0.001.
+   use. The learning rate. Defaults to `0.001`.
  rho: Discounting factor for the history/coming gradient. Defaults to
-   0.9.
- momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+   `0.9`.
+ momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
  epsilon: A small constant for numerical stability. This epsilon is
    "epsilon hat" in the Kingma and Ba paper (in the formula just before
    Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
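A brief sketch of the documented RMSprop defaults, including the centered variant (illustrative; the `tf.keras.optimizers.legacy.RMSprop` alias is assumed):

    import tensorflow as tf

    # Defaults: learning_rate=0.001, rho=0.9, momentum=0.0, epsilon=1e-7,
    # centered=False.
    opt = tf.keras.optimizers.legacy.RMSprop(learning_rate=0.001, rho=0.9)
    # centered=True normalizes gradients by their estimated variance
    # (slightly more expensive than the uncentered second moment).
    centered_opt = tf.keras.optimizers.legacy.RMSprop(centered=True)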
9 changes: 5 additions & 4 deletions keras/optimizers/legacy_learning_rate_decay.py
@@ -79,7 +79,7 @@ def exponential_decay(
    The decay rate.
  staircase: Boolean. If `True` decay the learning rate at discrete
    intervals
- name: String. Optional name of the operation. Defaults to
+ name: String. Optional name of the operation. Defaults to
    'ExponentialDecay'.
Returns:
@@ -264,9 +264,10 @@ def polynomial_decay(
  end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
    number. The minimal end learning rate.
  power: A scalar `float32` or `float64` `Tensor` or a Python number. The
-   power of the polynomial. Defaults to linear, 1.0.
- cycle: A boolean, whether or not it should cycle beyond decay_steps.
- name: String. Optional name of the operation. Defaults to
+   power of the polynomial. Defaults to `1.0`.
+ cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to
+   `False`.
+ name: String. Optional name of the operation. Defaults to
    'PolynomialDecay'.
Returns:
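To illustrate the two schedules touched above, a sketch using the `tf.compat.v1.train` aliases for these functions (the export path is an assumption; the step counter and hyperparameter values are also illustrative):

    import tensorflow as tf

    global_step = tf.Variable(0, trainable=False)
    # staircase=True decays at discrete intervals rather than continuously.
    exp_lr = tf.compat.v1.train.exponential_decay(
        learning_rate=0.1, global_step=global_step,
        decay_steps=1000, decay_rate=0.96, staircase=True
    )
    # power defaults to 1.0 (linear decay); cycle defaults to False.
    poly_lr = tf.compat.v1.train.polynomial_decay(
        learning_rate=0.1, global_step=global_step,
        decay_steps=1000, end_learning_rate=0.01, power=1.0, cycle=False
    )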
