Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix various flaky tests #16396

Merged
merged 3 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions agent/grpc-external/services/peerstream/stream_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1252,8 +1252,8 @@ func TestStreamResources_Server_DisconnectsOnHeartbeatTimeout(t *testing.T) {
})

testutil.RunStep(t, "stream is disconnected due to heartbeat timeout", func(t *testing.T) {
disconnectTime := ptr(it.FutureNow(1))
Copy link
Contributor Author

@kisunji kisunji Feb 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

incrementalTime it was being modified in a goroutine so there was a data race, leading disconnectTime to be sometimes incorrectly early. I moved the call to be local to each retry loop so that it would fetch the latest "now" value instead of predicting it would be 1 second in the future.

retry.Run(t, func(r *retry.R) {
disconnectTime := ptr(it.StaticNow())
status, ok := srv.StreamStatus(testPeerID)
require.True(r, ok)
require.False(r, status.Connected)
Expand Down Expand Up @@ -1423,7 +1423,7 @@ func makeClient(t *testing.T, srv *testServer, peerID string) *MockClient {
},
}))

// Receive a services and roots subscription request pair from server
// Receive ExportedService, ExportedServiceList, and PeeringTrustBundle subscription requests from server
receivedSub1, err := client.Recv()
require.NoError(t, err)
receivedSub2, err := client.Recv()
Expand Down
10 changes: 10 additions & 0 deletions agent/grpc-external/services/peerstream/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,16 @@ func (t *incrementalTime) Now() time.Time {
return t.base.Add(dur)
}

// StaticNow returns the current internal clock without advancing it.
func (t *incrementalTime) StaticNow() time.Time {
t.mu.Lock()
defer t.mu.Unlock()

dur := time.Duration(t.next) * time.Second

return t.base.Add(dur)
}

// FutureNow will return a given future value of the Now() function.
// The numerical argument indicates which future Now value you wanted. The
// value must be > 0.
Expand Down
47 changes: 28 additions & 19 deletions agent/xds/delta_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (

"github.com/armon/go-metrics"
envoy_discovery_v3 "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
"github.com/hashicorp/consul/api"
"github.com/stretchr/testify/require"
rpcstatus "google.golang.org/genproto/googleapis/rpc/status"
"google.golang.org/grpc/codes"
Expand All @@ -21,8 +20,10 @@ import (
"github.com/hashicorp/consul/agent/grpc-external/limiter"
"github.com/hashicorp/consul/agent/proxycfg"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/envoyextensions/xdscommon"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
"github.com/hashicorp/consul/version"
)

Expand Down Expand Up @@ -1057,19 +1058,23 @@ func TestServer_DeltaAggregatedResources_v3_ACLEnforcement(t *testing.T) {

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// aclResolve may be called in a goroutine even after a
// testcase tt returns. Capture the variable as tc so the
// values don't swap in the next iteration.
tc := tt
aclResolve := func(id string) (acl.Authorizer, error) {
if !tt.defaultDeny {
if !tc.defaultDeny {
// Allow all
return acl.RootAuthorizer("allow"), nil
}
if tt.acl == "" {
if tc.acl == "" {
// No token and defaultDeny is denied
return acl.RootAuthorizer("deny"), nil
}
// Ensure the correct token was passed
require.Equal(t, tt.token, id)
require.Equal(t, tc.token, id)
// Parse the ACL and enforce it
policy, err := acl.NewPolicyFromSource(tt.acl, nil, nil)
policy, err := acl.NewPolicyFromSource(tc.acl, nil, nil)
require.NoError(t, err)
return acl.NewPolicyAuthorizerWithDefaults(acl.RootAuthorizer("deny"), []*acl.Policy{policy}, nil)
}
Expand All @@ -1095,13 +1100,15 @@ func TestServer_DeltaAggregatedResources_v3_ACLEnforcement(t *testing.T) {

// If there is no token, check that we increment the gauge
if tt.token == "" {
data := scenario.sink.Data()
require.Len(t, data, 1)

item := data[0]
val, ok := item.Gauges["consul.xds.test.xds.server.streamsUnauthenticated"]
require.True(t, ok)
require.Equal(t, float32(1), val.Value)
retry.Run(t, func(r *retry.R) {
data := scenario.sink.Data()
require.Len(r, data, 1)

item := data[0]
val, ok := item.Gauges["consul.xds.test.xds.server.streamsUnauthenticated"]
require.True(r, ok)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assertion was failing randomly; I'm not 100% confident this will fix the issue but since scenario.sink.Data is timing-based I'm hoping retries will help here.

Copy link
Contributor

@NiniOak NiniOak Feb 23, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the retry might fix the issue. When reading this code earlier, I assumed this assertion was failing randomly because this key consul.xds.test.xds.server.streamsUnauthenticated isn't always found in the data from #L1101.

  • #L1102 validates that data exists and the value isn't empty; which leads me to assume that the key requested isn't found in #L1105
  • Is there a precise set of data that we use for testing that will always return this key/value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The gauge should be incremented in a normal codepath that is taken during the test: 1, 2

I think the tests could be flaking because either:

  1. There is some timing issue with the in-memory metrics sink
  2. There is a data race between goroutines within the test

I didn't spend too much time looking into it but added a bunch of fixes for possible culprits

require.Equal(r, float32(1), val.Value)
})
}

if !tt.wantDenied {
Expand Down Expand Up @@ -1138,13 +1145,15 @@ func TestServer_DeltaAggregatedResources_v3_ACLEnforcement(t *testing.T) {

// If there is no token, check that we decrement the gauge
if tt.token == "" {
data := scenario.sink.Data()
require.Len(t, data, 1)

item := data[0]
val, ok := item.Gauges["consul.xds.test.xds.server.streamsUnauthenticated"]
require.True(t, ok)
require.Equal(t, float32(0), val.Value)
retry.Run(t, func(r *retry.R) {
data := scenario.sink.Data()
require.Len(r, data, 1)

item := data[0]
val, ok := item.Gauges["consul.xds.test.xds.server.streamsUnauthenticated"]
require.True(r, ok)
require.Equal(r, float32(0), val.Value)
})
}
})
}
Expand Down
4 changes: 1 addition & 3 deletions agent/xds/xds_protocol_helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,6 @@ func newTestServerDeltaScenario(
) *testServerScenario {
mgr := newTestManager(t)
envoy := NewTestEnvoy(t, proxyID, token)
t.Cleanup(func() {
envoy.Close()
})
Comment on lines -169 to -171
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cleanups and defers are run in LIFO order. I wanted to ensure envoy.Close() got called before the global metrics sink was reset


sink := metrics.NewInmemSink(1*time.Minute, 1*time.Minute)
cfg := metrics.DefaultConfig("consul.xds.test")
Expand All @@ -177,6 +174,7 @@ func newTestServerDeltaScenario(
metrics.NewGlobal(cfg, sink)

t.Cleanup(func() {
envoy.Close()
sink := &metrics.BlackholeSink{}
metrics.NewGlobal(cfg, sink)
})
Expand Down
3 changes: 2 additions & 1 deletion command/debug/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,8 @@ func (c *cmd) prepare() (version string, err error) {
// If none are specified we will collect information from
// all by default
if len(c.capture) == 0 {
c.capture = defaultTargets
c.capture = make([]string, len(defaultTargets))
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very fun slice debugging exercise where the code block below was mutating defaultTargets which affected tests running in parallel

copy(c.capture, defaultTargets)
}

// If EnableDebug is not true, skip collecting pprof
Expand Down