Add weight decay to Adam #2415

Merged 5 commits on Jul 14, 2020
3 changes: 3 additions & 0 deletions pyzoo/zoo/pipeline/api/keras/optimizers.py
@@ -39,6 +39,7 @@ def __init__(self,
epsilon=1e-8,
decay=0.0,
schedule=None,
weight_decay=0.0,
bigdl_type="float"):
"""
:param lr learning rate
@@ -60,13 +61,15 @@ def __init__(self,
beta_2,
epsilon,
decay,
weight_decay,
schedule if (schedule) else Default()
)
self.bigdl_type = bigdl_type


class AdamWeightDecay(OptimMethod, ZooKerasCreator):
"""
Implements BERT version of Adam algorithm.
>>> adam = AdamWeightDecay()
creating: createZooKerasAdamWeightDecay
"""
@@ -41,9 +41,10 @@ class Adam[@specialized(Float, Double) T: ClassTag](
var beta_2: Double = 0.999,
var epsilon: Double = 1e-8,
var decay: Double = 0.0,
var wDecay: Double = 0.0,
val schedule: LearningRateSchedule = Default()
)(implicit ev: TensorNumeric[T]) extends SGD[T](learningRate = lr,
learningRateDecay = decay, learningRateSchedule = schedule) {
learningRateDecay = decay, weightDecay = wDecay, learningRateSchedule = schedule) {
Contributor commented:
Why don't you pass weightDecay directly to its parent class SGD, so that you can reuse SGD's implementation?


@transient
private var buffer: Tensor[T] = null
@@ -65,6 +66,7 @@ class Adam[@specialized(Float, Double) T: ClassTag](
val beta1 = this.beta_1
val beta2 = this.beta_2
val eps = this.epsilon
val wd = this.wDecay

val (fx, dfdx) = feval(parameter)
val state = SGDRef.getstate(this)
@@ -80,6 +82,10 @@

val clr = - this.schedule.currentRate

if(wd > 0) {
dfdx.add(parameter * (ev.fromType(wd)))
}

/**
* m_t = beta_1 * m_t-1 + (1 - beta_1) * g_t
* v_t = beta_2 * v_t-1 + (1 - beta_2) * g_t * g_t
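For reference, the update performed by the Scala diff above, with the new wd term folded into the gradient before the usual Adam moment updates, can be sketched in NumPy as follows (hyperparameter names follow the Scala signature; this is an illustration, not the BigDL implementation):

```python
import numpy as np

def adam_step(theta, grad, m, v, t, lr=1e-3, beta_1=0.9, beta_2=0.999,
              epsilon=1e-8, w_decay=0.0):
    """One Adam step with L2-style weight decay folded into the gradient."""
    if w_decay > 0:
        # Mirrors `dfdx.add(parameter * ev.fromType(wd))` in the diff above.
        grad = grad + w_decay * theta
    # m_t = beta_1 * m_{t-1} + (1 - beta_1) * g_t
    m = beta_1 * m + (1 - beta_1) * grad
    # v_t = beta_2 * v_{t-1} + (1 - beta_2) * g_t * g_t
    v = beta_2 * v + (1 - beta_2) * grad * grad
    m_hat = m / (1 - beta_1 ** t)
    v_hat = v / (1 - beta_2 ** t)
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return theta, m, v
```

Note that this applies classic L2 regularization through the gradient; the separate AdamWeightDecay class in the Python diff, documented there as the BERT version of Adam, handles weight decay differently.
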
@@ -1032,9 +1032,10 @@ class PythonZooKeras[T: ClassTag](implicit ev: TensorNumeric[T]) extends PythonZ
beta_2: Double = 0.999,
epsilon: Double = 1e-8,
decay: Double = 0.0,
weightDecay: Double = 0.0,
schedule: SGD.LearningRateSchedule = SGD.Default()
): Adam[T] = {
new Adam[T](lr, beta_1, beta_2, epsilon, decay, schedule)
new Adam[T](lr, beta_1, beta_2, epsilon, decay, weightDecay, schedule)
}
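
One detail worth noting in the signatures above: decay and weightDecay are different knobs. decay is forwarded to SGD as learningRateDecay and shrinks the step size over iterations, while weightDecay penalizes large parameters by adding wd * parameter to the gradient. A rough illustration (the learning-rate formula assumes a Default()-style 1 / (1 + decay * iteration) schedule, which is an assumption here, not taken from this diff):

```python
# Illustration only; not Analytics Zoo / BigDL code.
def effective_lr(lr, decay, iteration):
    # Learning-rate decay: the step size shrinks as training progresses
    # (assumed Default()-style schedule).
    return lr / (1.0 + decay * iteration)

def regularized_grad(grad, theta, weight_decay):
    # Weight decay: L2 penalty folded into the gradient, as in the Scala diff.
    return grad + weight_decay * theta
```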

def createZooKerasHardShrink(