Skip to content

Commit

Permalink
acme/autocert: fix races in renewal tests
Browse files Browse the repository at this point in the history
TestRenewFromCache and TestRenewFromCacheAlreadyRenewed had several
races and API misuses:

1. They called t.Fatalf from a goroutine other than the one invoking
   the Test function, which is explicitly disallowed (see
   https://pkg.go.dev/testing#T).

2. The test did not stop the renewal timers prior to restoring
   test-hook functions, and the process of stopping the renewal timers
   itself did not wait for in-flight calls to complete. That could
   cause data races if one of the renewals failed and triggered a
   retry with a short-enough randomized backoff.
   (One such race was observed in
   https://build.golang.org/log/1a19e22ad826bedeb5a939c6130f368f9979208a.)

3. The testDidRenewLoop hooks accessed the Manager.renewal field
   without locking the Mutex guarding that field.

4. TestGetCertificate_failedAttempt set a testDidRemoveState hook, but
   didn't wait for the timers referring to that hook to complete
   before restoring it, causing races with other timers. I tried
   pulling on that thread a bit, but couldn't untangle the numerous
   untracked goroutines in the package. Instead, I have made a smaller
   and more local change to copy the value of testDidRemoveState into
   a local variable in the timer's closure.

Given the number of untracked goroutines in this package, it is likely
that races and/or deadlocks remain. Notably, so far I have been unable
to spot the actual cause of golang/go#51080.

For golang/go#51080

Change-Id: I7797f6ac34ef3c272f16ca805251dac3aa7f0009
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/384594
Trust: Bryan Mills <[email protected]>
Run-TryBot: Bryan Mills <[email protected]>
Reviewed-by: Filippo Valsorda <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
  • Loading branch information
Bryan C. Mills committed Feb 10, 2022
1 parent db63837 commit f4118a5
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 58 deletions.
11 changes: 6 additions & 5 deletions acme/autocert/autocert.go
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ func (m *Manager) cert(ctx context.Context, ck certKey) (*tls.Certificate, error
leaf: cert.Leaf,
}
m.state[ck] = s
go m.renew(ck, s.key, s.leaf.NotAfter)
go m.startRenew(ck, s.key, s.leaf.NotAfter)
return cert, nil
}

Expand Down Expand Up @@ -584,8 +584,9 @@ func (m *Manager) createCert(ctx context.Context, ck certKey) (*tls.Certificate,
if err != nil {
// Remove the failed state after some time,
// making the manager call createCert again on the following TLS hello.
didRemove := testDidRemoveState // The lifetime of this timer is untracked, so copy mutable local state to avoid races.
time.AfterFunc(createCertRetryAfter, func() {
defer testDidRemoveState(ck)
defer didRemove(ck)
m.stateMu.Lock()
defer m.stateMu.Unlock()
// Verify the state hasn't changed and it's still invalid
Expand All @@ -603,7 +604,7 @@ func (m *Manager) createCert(ctx context.Context, ck certKey) (*tls.Certificate,
}
state.cert = der
state.leaf = leaf
go m.renew(ck, state.key, state.leaf.NotAfter)
go m.startRenew(ck, state.key, state.leaf.NotAfter)
return state.tlscert()
}

Expand Down Expand Up @@ -893,15 +894,15 @@ func httpTokenCacheKey(tokenPath string) string {
return path.Base(tokenPath) + "+http-01"
}

// renew starts a cert renewal timer loop, one per domain.
// startRenew starts a cert renewal timer loop, one per domain.
//
// The loop is scheduled in two cases:
// - a cert was fetched from cache for the first time (wasn't in m.state)
// - a new cert was created by m.createCert
//
// The key argument is a certificate private key.
// The exp argument is the cert expiration time (NotAfter).
func (m *Manager) renew(ck certKey, key crypto.Signer, exp time.Time) {
func (m *Manager) startRenew(ck certKey, key crypto.Signer, exp time.Time) {
m.renewalMu.Lock()
defer m.renewalMu.Unlock()
if m.renewal[ck] != nil {
Expand Down
35 changes: 25 additions & 10 deletions acme/autocert/renewal.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ type domainRenewal struct {
ck certKey
key crypto.Signer

timerMu sync.Mutex
timer *time.Timer
timerMu sync.Mutex
timer *time.Timer
timerClose chan struct{} // if non-nil, renew closes this channel (and nils out the timer fields) instead of running
}

// start starts a cert renewal timer at the time
Expand All @@ -38,24 +39,38 @@ func (dr *domainRenewal) start(exp time.Time) {
dr.timer = time.AfterFunc(dr.next(exp), dr.renew)
}

// stop stops the cert renewal timer.
// If the timer is already stopped, calling stop is a noop.
// stop stops the cert renewal timer and waits for any in-flight calls to renew
// to complete. If the timer is already stopped, calling stop is a noop.
func (dr *domainRenewal) stop() {
dr.timerMu.Lock()
defer dr.timerMu.Unlock()
if dr.timer == nil {
return
for {
if dr.timer == nil {
return
}
if dr.timer.Stop() {
dr.timer = nil
return
} else {
// dr.timer fired, and we acquired dr.timerMu before the renew callback did.
// (We know this because otherwise the renew callback would have reset dr.timer!)
timerClose := make(chan struct{})
dr.timerClose = timerClose
dr.timerMu.Unlock()
<-timerClose
dr.timerMu.Lock()
}
}
dr.timer.Stop()
dr.timer = nil
}

// renew is called periodically by a timer.
// The first renew call is kicked off by dr.start.
func (dr *domainRenewal) renew() {
dr.timerMu.Lock()
defer dr.timerMu.Unlock()
if dr.timer == nil {
if dr.timerClose != nil {
close(dr.timerClose)
dr.timer, dr.timerClose = nil, nil
return
}

Expand All @@ -67,8 +82,8 @@ func (dr *domainRenewal) renew() {
next = renewJitter / 2
next += time.Duration(pseudoRand.int63n(int64(next)))
}
dr.timer = time.AfterFunc(next, dr.renew)
testDidRenewLoop(next, err)
dr.timer = time.AfterFunc(next, dr.renew)
}

// updateState locks and replaces the relevant Manager.state item with the given
Expand Down
110 changes: 67 additions & 43 deletions acme/autocert/renewal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,23 @@ func TestRenewFromCache(t *testing.T) {

// verify the renewal happened
defer func() {
// Stop the timers that read and execute testDidRenewLoop before restoring it.
// Otherwise the timer callback may race with the deferred write.
man.stopRenew()
testDidRenewLoop = func(next time.Duration, err error) {}
}()
done := make(chan struct{})
renewed := make(chan bool, 1)
testDidRenewLoop = func(next time.Duration, err error) {
defer close(done)
defer func() {
select {
case renewed <- true:
default:
// The renewal timer uses a random backoff. If the first renewal fails for
// some reason, we could end up with multiple calls here before the test
// stops the timer.
}
}()

if err != nil {
t.Errorf("testDidRenewLoop: %v", err)
}
Expand All @@ -81,7 +93,8 @@ func TestRenewFromCache(t *testing.T) {
after := time.Now().Add(future)
tlscert, err := man.cacheGet(context.Background(), exampleCertKey)
if err != nil {
t.Fatalf("man.cacheGet: %v", err)
t.Errorf("man.cacheGet: %v", err)
return
}
if !tlscert.Leaf.NotAfter.After(after) {
t.Errorf("cache leaf.NotAfter = %v; want > %v", tlscert.Leaf.NotAfter, after)
Expand All @@ -92,11 +105,13 @@ func TestRenewFromCache(t *testing.T) {
defer man.stateMu.Unlock()
s := man.state[exampleCertKey]
if s == nil {
t.Fatalf("m.state[%q] is nil", exampleCertKey)
t.Errorf("m.state[%q] is nil", exampleCertKey)
return
}
tlscert, err = s.tlscert()
if err != nil {
t.Fatalf("s.tlscert: %v", err)
t.Errorf("s.tlscert: %v", err)
return
}
if !tlscert.Leaf.NotAfter.After(after) {
t.Errorf("state leaf.NotAfter = %v; want > %v", tlscert.Leaf.NotAfter, after)
Expand All @@ -108,13 +123,7 @@ func TestRenewFromCache(t *testing.T) {
if _, err := man.GetCertificate(hello); err != nil {
t.Fatal(err)
}

// wait for renew loop
select {
case <-time.After(10 * time.Second):
t.Fatal("renew took too long to occur")
case <-done:
}
<-renewed
}

func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
Expand Down Expand Up @@ -159,11 +168,23 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {

// verify the renewal accepted the newer cached cert
defer func() {
// Stop the timers that read and execute testDidRenewLoop before restoring it.
// Otherwise the timer callback may race with the deferred write.
man.stopRenew()
testDidRenewLoop = func(next time.Duration, err error) {}
}()
done := make(chan struct{})
renewed := make(chan bool, 1)
testDidRenewLoop = func(next time.Duration, err error) {
defer close(done)
defer func() {
select {
case renewed <- true:
default:
// The renewal timer uses a random backoff. If the first renewal fails for
// some reason, we could end up with multiple calls here before the test
// stops the timer.
}
}()

if err != nil {
t.Errorf("testDidRenewLoop: %v", err)
}
Expand All @@ -177,7 +198,8 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
// ensure the cached cert was not modified
tlscert, err := man.cacheGet(context.Background(), exampleCertKey)
if err != nil {
t.Fatalf("man.cacheGet: %v", err)
t.Errorf("man.cacheGet: %v", err)
return
}
if !tlscert.Leaf.NotAfter.Equal(newLeaf.NotAfter) {
t.Errorf("cache leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
Expand All @@ -188,30 +210,22 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
defer man.stateMu.Unlock()
s := man.state[exampleCertKey]
if s == nil {
t.Fatalf("m.state[%q] is nil", exampleCertKey)
t.Errorf("m.state[%q] is nil", exampleCertKey)
return
}
stateKey := s.key.Public().(*ecdsa.PublicKey)
if !stateKey.Equal(newLeaf.PublicKey) {
t.Fatal("state key was not updated from cache")
t.Error("state key was not updated from cache")
return
}
tlscert, err = s.tlscert()
if err != nil {
t.Fatalf("s.tlscert: %v", err)
t.Errorf("s.tlscert: %v", err)
return
}
if !tlscert.Leaf.NotAfter.Equal(newLeaf.NotAfter) {
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
}

// verify the private key is replaced in the renewal state
r := man.renewal[exampleCertKey]
if r == nil {
t.Fatalf("m.renewal[%q] is nil", exampleCertKey)
}
renewalKey := r.key.Public().(*ecdsa.PublicKey)
if !renewalKey.Equal(newLeaf.PublicKey) {
t.Fatal("renewal private key was not updated from cache")
}

}

// assert the expiring cert is returned from state
Expand All @@ -225,21 +239,31 @@ func TestRenewFromCacheAlreadyRenewed(t *testing.T) {
}

// trigger renew
go man.renew(exampleCertKey, s.key, s.leaf.NotAfter)

// wait for renew loop
select {
case <-time.After(10 * time.Second):
t.Fatal("renew took too long to occur")
case <-done:
// assert the new cert is returned from state after renew
hello := clientHelloInfo(exampleDomain, algECDSA)
tlscert, err := man.GetCertificate(hello)
if err != nil {
t.Fatal(err)
man.startRenew(exampleCertKey, s.key, s.leaf.NotAfter)
<-renewed
func() {
man.renewalMu.Lock()
defer man.renewalMu.Unlock()

// verify the private key is replaced in the renewal state
r := man.renewal[exampleCertKey]
if r == nil {
t.Errorf("m.renewal[%q] is nil", exampleCertKey)
return
}
if !newLeaf.NotAfter.Equal(tlscert.Leaf.NotAfter) {
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
renewalKey := r.key.Public().(*ecdsa.PublicKey)
if !renewalKey.Equal(newLeaf.PublicKey) {
t.Error("renewal private key was not updated from cache")
}
}()

// assert the new cert is returned from state after renew
hello = clientHelloInfo(exampleDomain, algECDSA)
tlscert, err = man.GetCertificate(hello)
if err != nil {
t.Fatal(err)
}
if !newLeaf.NotAfter.Equal(tlscert.Leaf.NotAfter) {
t.Errorf("state leaf.NotAfter = %v; want == %v", tlscert.Leaf.NotAfter, newLeaf.NotAfter)
}
}

0 comments on commit f4118a5

Please sign in to comment.