From 1a57052c4e234a32e05cf7fea0ed4956f1ae8439 Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Wed, 12 Apr 2023 20:36:47 -0400
Subject: [PATCH 1/2]
 [keras/optimizers/legacy/adadelta.py,keras/optimizers/legacy/adagrad.py,keras/optimizers/legacy/adam.py,keras/optimizers/legacy/ftrl.py,keras/optimizers/legacy/gradient_descent.py,keras/optimizers/legacy/optimizer_v2.py,keras/optimizers/legacy/rmsprop.py,keras/optimizers/legacy_learning_rate_decay.py]
 Standardise docstring usage of "Default to"

---
 keras/optimizers/legacy/adadelta.py            |  2 +-
 keras/optimizers/legacy/adagrad.py             |  2 +-
 keras/optimizers/legacy/adam.py                | 17 +++++++++--------
 keras/optimizers/legacy/ftrl.py                |  6 +++---
 keras/optimizers/legacy/gradient_descent.py    |  6 +++---
 keras/optimizers/legacy/optimizer_v2.py        |  4 ++--
 keras/optimizers/legacy/rmsprop.py             | 15 ++++++++-------
 keras/optimizers/legacy_learning_rate_decay.py |  9 +++++----
 8 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/keras/optimizers/legacy/adadelta.py b/keras/optimizers/legacy/adadelta.py
index 4b8b1680e2f..9310a9bfcfd 100644
--- a/keras/optimizers/legacy/adadelta.py
+++ b/keras/optimizers/legacy/adadelta.py
@@ -48,10 +48,10 @@ class Adadelta(optimizer_v2.OptimizerV2):
     learning_rate: Initial value for the learning rate: either a floating
       point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule`
       instance.
-      Defaults to 0.001.
       Note that `Adadelta` tends to benefit from higher initial learning rate
       values compared to other optimizers. To match the exact form in the
       original paper, use 1.0.
+      Defaults to `0.001`.
     rho: A `Tensor` or a floating point value. The decay rate.
     epsilon: Small floating point value used to maintain numerical stability.
     name: Optional name prefix for the operations created when applying
diff --git a/keras/optimizers/legacy/adagrad.py b/keras/optimizers/legacy/adagrad.py
index c29280c8690..4b130051416 100644
--- a/keras/optimizers/legacy/adagrad.py
+++ b/keras/optimizers/legacy/adagrad.py
@@ -40,10 +40,10 @@ class Adagrad(optimizer_v2.OptimizerV2):
     learning_rate: Initial value for the learning rate: either a floating
       point value, or a `tf.keras.optimizers.schedules.LearningRateSchedule`
      instance.
-      Defaults to 0.001.
       Note that `Adagrad` tends to benefit from higher initial learning rate
       values compared to other optimizers. To match the exact form in the
       original paper, use 1.0.
+      Defaults to `0.001`.
     initial_accumulator_value: Floating point value. Starting value for the
       accumulators (per-parameter momentum values). Must be non-negative.
diff --git a/keras/optimizers/legacy/adam.py b/keras/optimizers/legacy/adam.py
index a416d22f10b..3678f316de8 100644
--- a/keras/optimizers/legacy/adam.py
+++ b/keras/optimizers/legacy/adam.py
@@ -44,17 +44,18 @@ class Adam(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use, The
-      learning rate. Defaults to 0.001.
+      learning rate. Defaults to `0.001`.
     beta_1: A float value or a constant float tensor, or a callable
       that takes no arguments and returns the actual value to use. The
-      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
+      exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
     beta_2: A float value or a constant float tensor, or a callable
       that takes no arguments and returns the actual value to use, The
-      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
+      exponential decay rate for the 2nd moment estimates. Defaults to
+      `0.999`.
     epsilon: A small constant for numerical stability. This epsilon is
       "epsilon hat" in the Kingma and Ba paper (in the formula just before
       Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-      1e-7.
+      `1e-7`.
     amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
       the paper "On the Convergence of Adam and beyond". Defaults to `False`.
     name: Optional name for the operations created when applying gradients.
@@ -364,19 +365,19 @@ def __init__(
           learning_rate: A `Tensor`, floating point value, or a schedule that
             is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
             callable that takes no arguments and returns the actual value to
-            use, The learning rate. Defaults to 0.001.
+            use, The learning rate. Defaults to `0.001`.
           beta_1: A float value or a constant float tensor, or a callable that
             takes no arguments and returns the actual value to use. The
             exponential decay rate for the 1st moment estimates. Defaults to
-            0.9.
+            `0.9`.
           beta_2: A float value or a constant float tensor, or a callable that
             takes no arguments and returns the actual value to use, The
            exponential decay rate for the 2nd moment estimates. Defaults to
-            0.999.
+            `0.999`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
-            to 1e-7.
+            to `1e-7`.
          amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and beyond". Defaults to
            `False`.
diff --git a/keras/optimizers/legacy/ftrl.py b/keras/optimizers/legacy/ftrl.py
index d41536ecaf1..0e592b26874 100644
--- a/keras/optimizers/legacy/ftrl.py
+++ b/keras/optimizers/legacy/ftrl.py
@@ -81,9 +81,9 @@ class Ftrl(optimizer_v2.OptimizerV2):
     initial_accumulator_value: The starting value for accumulators. Only
       zero or positive values are allowed.
     l1_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
+      equal to zero. Defaults to `0.0`.
     l2_regularization_strength: A float value, must be greater than or
-      equal to zero. Defaults to 0.0.
+      equal to zero. Defaults to `0.0`.
     name: Optional name prefix for the operations created when applying
       gradients. Defaults to `"Ftrl"`.
     l2_shrinkage_regularization_strength: A float value, must be greater than
@@ -91,7 +91,7 @@ class Ftrl(optimizer_v2.OptimizerV2):
       stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
       When input is sparse shrinkage will only happen on the active weights.
     beta: A float value, representing the beta value from the paper.
-      Defaults to 0.0.
+      Defaults to `0.0`.
     **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
       `clipnorm`, `global_clipnorm`.
       If `clipvalue` (float) is set, the gradient of each weight
diff --git a/keras/optimizers/legacy/gradient_descent.py b/keras/optimizers/legacy/gradient_descent.py
index 0bcb10fdfec..8d305f705e6 100644
--- a/keras/optimizers/legacy/gradient_descent.py
+++ b/keras/optimizers/legacy/gradient_descent.py
@@ -54,10 +54,10 @@ class SGD(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use. The
-      learning rate. Defaults to 0.01.
+      learning rate. Defaults to `0.01`.
     momentum: float hyperparameter >= 0 that accelerates gradient descent in
-      the relevant direction and dampens oscillations. Defaults to 0, i.e.,
-      vanilla gradient descent.
+      the relevant direction and dampens oscillations. Vanilla gradient
+      descent means no momentum. Defaults to `0.`.
     nesterov: boolean. Whether to apply Nesterov momentum. Defaults to
       `False`.
     name: Optional name prefix for the operations created when applying
diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py
index 7deacfad20e..ca56b07cfaa 100644
--- a/keras/optimizers/legacy/optimizer_v2.py
+++ b/keras/optimizers/legacy/optimizer_v2.py
@@ -692,8 +692,8 @@ def apply_gradients(
 
         Args:
           grads_and_vars: List of (gradient, variable) pairs.
-          name: Optional name for the returned operation. Default to the name
-            passed to the `Optimizer` constructor.
+          name: Optional name for the returned operation. When None, uses the
+            name passed to the `Optimizer` constructor. Defaults to `None`.
          experimental_aggregate_gradients: Whether to sum gradients from
            different replicas in the presence of `tf.distribute.Strategy`. If
            False, it's user responsibility to aggregate the gradients. Default
diff --git a/keras/optimizers/legacy/rmsprop.py b/keras/optimizers/legacy/rmsprop.py
index 626c333398d..5537de9cc8a 100644
--- a/keras/optimizers/legacy/rmsprop.py
+++ b/keras/optimizers/legacy/rmsprop.py
@@ -45,13 +45,14 @@ class RMSprop(optimizer_v2.OptimizerV2):
     learning_rate: A `Tensor`, floating point value, or a schedule that is a
       `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
       that takes no arguments and returns the actual value to use. The
-      learning rate. Defaults to 0.001.
-    rho: Discounting factor for the history/coming gradient. Defaults to 0.9.
-    momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+      learning rate. Defaults to `0.001`.
+    rho: Discounting factor for the history/coming gradient. Defaults to
+      `0.9`.
+    momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
     epsilon: A small constant for numerical stability. This epsilon is
       "epsilon hat" in the Kingma and Ba paper (in the formula just before
       Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-      1e-7.
+      `1e-7`.
     centered: Boolean. If `True`, gradients are normalized by the estimated
       variance of the gradient; if False, by the uncentered second moment.
       Setting this to `True` may help with training, but is slightly more
@@ -111,10 +112,10 @@ def __init__(
           learning_rate: A `Tensor`, floating point value, or a schedule that
            is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
            callable that takes no arguments and returns the actual value to
-            use. The learning rate. Defaults to 0.001.
+            use. The learning rate. Defaults to `0.001`.
          rho: Discounting factor for the history/coming gradient. Defaults to
-            0.9.
-          momentum: A scalar or a scalar `Tensor`. Defaults to 0.0.
+            `0.9`.
+          momentum: A scalar or a scalar `Tensor`. Defaults to `0.0`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py
index a75a43e0372..93bd9dabd1a 100644
--- a/keras/optimizers/legacy_learning_rate_decay.py
+++ b/keras/optimizers/legacy_learning_rate_decay.py
@@ -79,7 +79,7 @@ def exponential_decay(
       The decay rate.
     staircase: Boolean. If `True` decay the learning rate at discrete
       intervals
-    name: String.  Optional name of the operation.  Defaults to
+    name: String. Optional name of the operation. Defaults to
       'ExponentialDecay'.
 
   Returns:
@@ -264,9 +264,10 @@ def polynomial_decay(
     end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
       number. The minimal end learning rate.
     power: A scalar `float32` or `float64` `Tensor` or a Python number. The
-      power of the polynomial. Defaults to linear, 1.0.
-    cycle: A boolean, whether or not it should cycle beyond decay_steps.
-    name: String.  Optional name of the operation.  Defaults to
+      power of the polynomial. Linear is default. Defaults to `1.0`.
+    cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to
+      `False`.
+    name: String. Optional name of the operation. Defaults to
       'PolynomialDecay'.
 
   Returns:

From 0ebca04adac6604cb12babf2c5b5ef7ac95ba0ea Mon Sep 17 00:00:00 2001
From: Samuel Marks <807580+SamuelMarks@users.noreply.github.com>
Date: Wed, 19 Apr 2023 23:04:18 -0400
Subject: [PATCH 2/2] [keras/optimizers/legacy/optimizer_v2.py] Backtick
 keywords in docstring ; [keras/optimizers/legacy_learning_rate_decay.py]
 Remove "Linear default"

---
 keras/optimizers/legacy/optimizer_v2.py        | 4 ++--
 keras/optimizers/legacy_learning_rate_decay.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/keras/optimizers/legacy/optimizer_v2.py b/keras/optimizers/legacy/optimizer_v2.py
index ca56b07cfaa..984d721f0b3 100644
--- a/keras/optimizers/legacy/optimizer_v2.py
+++ b/keras/optimizers/legacy/optimizer_v2.py
@@ -692,12 +692,12 @@ def apply_gradients(
 
         Args:
          grads_and_vars: List of (gradient, variable) pairs.
-          name: Optional name for the returned operation. When None, uses the
+          name: Optional name for the returned operation. When `None`, uses the
            name passed to the `Optimizer` constructor. Defaults to `None`.
          experimental_aggregate_gradients: Whether to sum gradients from
            different replicas in the presence of `tf.distribute.Strategy`. If
            False, it's user responsibility to aggregate the gradients. Default
-            to True.
+            to `True`.
 
        Returns:
          An `Operation` that applies the specified gradients. The `iterations`
diff --git a/keras/optimizers/legacy_learning_rate_decay.py b/keras/optimizers/legacy_learning_rate_decay.py
index 93bd9dabd1a..8d8c217cecd 100644
--- a/keras/optimizers/legacy_learning_rate_decay.py
+++ b/keras/optimizers/legacy_learning_rate_decay.py
@@ -264,7 +264,7 @@ def polynomial_decay(
     end_learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
      number. The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a Python number. The
-      power of the polynomial. Linear is default. Defaults to `1.0`.
+      power of the polynomial. Defaults to `1.0`.
    cycle: A boolean, whether it should cycle beyond decay_steps. Defaults to
      `False`.
    name: String. Optional name of the operation. Defaults to