From 07c45b1c8254bf2cc44dea11d4395593fb0f7b4b Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 10:47:56 +0000
Subject: [PATCH 1/7] acceptance: link failing TestDockerCSharp to the proper
 issue

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/adapter_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/acceptance/adapter_test.go b/pkg/acceptance/adapter_test.go
index de3722439b1e..32be85856f7d 100644
--- a/pkg/acceptance/adapter_test.go
+++ b/pkg/acceptance/adapter_test.go
@@ -39,7 +39,7 @@ func TestDockerC(t *testing.T) {
 }
 
 func TestDockerCSharp(t *testing.T) {
-	skip.WithIssue(t, 58218, "flaky test")
+	skip.WithIssue(t, 86852, "test not up to date anymore")
 	s := log.Scope(t)
 	defer s.Close(t)

From a98bf544d1c2adf5d2de0b54269e37cfbd530f61 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 10:48:31 +0000
Subject: [PATCH 2/7] acceptance: fix the test dir preparation for
 docker-under-bzl tests

Prior to this change, the `defer` for the directory deletion could run
before the docker shutdown sequence. This was incorrect, and could cause
the docker container to fail too early. The result was that the docker
controller would report a `die` event before the test was expecting it
(in the `Assert` method), and the assert would fail.

This commit fixes it.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/util_docker.go | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/pkg/acceptance/util_docker.go b/pkg/acceptance/util_docker.go
index 3bfac33a67c9..a0f332840502 100644
--- a/pkg/acceptance/util_docker.go
+++ b/pkg/acceptance/util_docker.go
@@ -69,19 +69,6 @@ func testDocker(
 ) error {
 	var err error
 	RunDocker(t, func(t *testing.T) {
-		cfg := cluster.TestConfig{
-			Name:     name,
-			Duration: *flagDuration,
-		}
-		for i := 0; i < num; i++ {
-			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
-		}
-		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
-		defer l.AssertAndStop(ctx, t)
-
-		if len(l.Nodes) > 0 {
-			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
-		}
 		var pwd string
 		pwd, err = os.Getwd()
 		if err != nil {
@@ -121,6 +108,25 @@ func testDocker(
 			}()
 			hostConfig.Binds = append(hostConfig.Binds, interactivetestsDir+":/mnt/interactive_tests")
 		}
+
+		// Prepare the docker cluster.
+		// We need to do this "under" the directory preparation above so as
+		// to prevent the test from crashing because the directory gets
+		// deleted before the container shutdown assertions get a chance to run.
+		cfg := cluster.TestConfig{
+			Name:     name,
+			Duration: *flagDuration,
+		}
+		for i := 0; i < num; i++ {
+			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
+		}
+		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
+		defer l.AssertAndStop(ctx, t)
+
+		if len(l.Nodes) > 0 {
+			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
+		}
+
 		err = l.OneShot(
 			ctx, acceptanceImage, types.ImagePullOptions{}, containerConfig, hostConfig,
 			platforms.DefaultSpec(), "docker-"+name,

From 24b86c9d27da941a8907720908fa8bbc5ac6eb7e Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:44:33 +0000
Subject: [PATCH 3/7] acceptance: ensure the monitor is stopped before shutdown

The cluster stop() method was not waiting for the monitor goroutine to
actually shut down before it would go on to kill the nodes ungracefully.
The result could be spurious `die` events in the final assertion.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 9fb5bb22ac49..8d97932a3ea2 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -129,6 +129,7 @@ type DockerCluster struct {
 	stopper              *stop.Stopper
 	monitorCtx           context.Context
 	monitorCtxCancelFunc func()
+	monitorDone          chan struct{}
 	clusterID            string
 	networkID            string
 	networkName          string
@@ -544,6 +545,7 @@ func (l *DockerCluster) processEvent(ctx context.Context, event events.Message)
 	// If there's currently a oneshot container, ignore any die messages from
 	// it because those are expected.
 	if l.oneshot != nil && event.ID == l.oneshot.id && event.Status == eventDie {
+		log.Infof(ctx, "Docker event was: the oneshot container terminated")
 		return true
 	}
 
@@ -585,7 +587,9 @@ func (l *DockerCluster) processEvent(ctx context.Context, event events.Message)
 	return false
 }
 
-func (l *DockerCluster) monitor(ctx context.Context) {
+func (l *DockerCluster) monitor(ctx context.Context, monitorDone chan struct{}) {
+	defer close(monitorDone)
+
 	if log.V(1) {
 		log.Infof(ctx, "events monitor starts")
 		defer log.Infof(ctx, "events monitor exits")
@@ -603,6 +607,9 @@
 	})
 	for {
 		select {
+		case <-l.monitorCtx.Done():
+			log.Infof(ctx, "monitor shutting down")
+			return false
 		case err := <-errq:
 			log.Infof(ctx, "event stream done, resetting...: %s", err)
 			// Sometimes we get a random string-wrapped EOF error back.
@@ -640,7 +647,8 @@ func (l *DockerCluster) Start(ctx context.Context) {
 	log.Infof(ctx, "starting %d nodes", len(l.Nodes))
 	l.monitorCtx, l.monitorCtxCancelFunc = context.WithCancel(context.Background())
-	go l.monitor(ctx)
+	l.monitorDone = make(chan struct{})
+	go l.monitor(ctx, l.monitorDone)
 	var wg sync.WaitGroup
 	wg.Add(len(l.Nodes))
 	for _, node := range l.Nodes {
@@ -713,6 +721,7 @@ func (l *DockerCluster) stop(ctx context.Context) {
 	if l.monitorCtxCancelFunc != nil {
 		l.monitorCtxCancelFunc()
 		l.monitorCtxCancelFunc = nil
+		<-l.monitorDone
 	}
 
 	if l.vols != nil {

From 45de70e4c9b45df6daf75e660f8cd6f10dd2ac0d Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:48:07 +0000
Subject: [PATCH 4/7] acceptance/cluster: simplify the interface

This clarifies that the container is not expected to run anymore at the
end of a wait.

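For reference, here is a self-contained sketch of the wait pattern the
renamed helper wraps, using the Docker Go SDK directly. The client setup
and the container ID below are illustrative placeholders, not taken from
the test harness:

    package main

    import (
        "context"
        "log"

        "github.com/docker/docker/api/types/container"
        "github.com/docker/docker/client"
    )

    // waitUntilNotRunning blocks until the given container has exited,
    // which is the post-condition the new method name is meant to convey.
    func waitUntilNotRunning(ctx context.Context, cli *client.Client, id string) error {
        // WaitConditionNotRunning resolves once the container is no longer running.
        okCh, errCh := cli.ContainerWait(ctx, id, container.WaitConditionNotRunning)
        select {
        case err := <-errCh:
            return err
        case <-okCh:
            return nil
        }
    }

    func main() {
        cli, err := client.NewClientWithOpts(client.FromEnv)
        if err != nil {
            log.Fatal(err)
        }
        // "some-container" is a placeholder container ID.
        if err := waitUntilNotRunning(context.Background(), cli, "some-container"); err != nil {
            log.Fatal(err)
        }
    }

The renamed method in docker.go is essentially this select, bound to the
test cluster's client and the container's own ID.
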
Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/docker.go        | 6 +++---
 pkg/acceptance/cluster/dockercluster.go | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/acceptance/cluster/docker.go b/pkg/acceptance/cluster/docker.go
index 33abb281c510..188f633a3e5f 100644
--- a/pkg/acceptance/cluster/docker.go
+++ b/pkg/acceptance/cluster/docker.go
@@ -287,9 +287,9 @@ func (c *Container) Restart(ctx context.Context, timeout *time.Duration) error {
 	return nil
 }
 
-// Wait waits for a running container to exit.
-func (c *Container) Wait(ctx context.Context, condition container.WaitCondition) error {
-	waitOKBodyCh, errCh := c.cluster.client.ContainerWait(ctx, c.id, condition)
+// WaitUntilNotRunning waits for a running container to exit.
+func (c *Container) WaitUntilNotRunning(ctx context.Context) error {
+	waitOKBodyCh, errCh := c.cluster.client.ContainerWait(ctx, c.id, container.WaitConditionNotRunning)
 	select {
 	case err := <-errCh:
 		return err
diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 8d97932a3ea2..f17149330ad3 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -238,7 +238,7 @@ func (l *DockerCluster) OneShot(
 	if err := l.oneshot.Start(ctx); err != nil {
 		return err
 	}
-	return l.oneshot.Wait(ctx, container.WaitConditionNotRunning)
+	return l.oneshot.WaitUntilNotRunning(ctx)
 }
 
 // stopOnPanic is invoked as a deferred function in Start in order to attempt
@@ -375,7 +375,7 @@ func (l *DockerCluster) initCluster(ctx context.Context) {
 	// and it'll get in the way of future runs.
 	l.vols = c
 	maybePanic(c.Start(ctx))
-	maybePanic(c.Wait(ctx, container.WaitConditionNotRunning))
+	maybePanic(c.WaitUntilNotRunning(ctx))
 }
 
 // cockroachEntryPoint returns the value to be used as

From 551c2e1affe4ef1db2f98cb43e0fe3adf8e9db44 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:50:25 +0000
Subject: [PATCH 5/7] acceptance: remove the volume ctn after the node
 containers

(Since the latter depend on the former)

Also, this preserves the Nodes slice, so it can be inspected with a
debugger.

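As a consequence of keeping the slice, a node's Container pointer is nil
once it has been torn down, so loops over Nodes must skip such entries;
this is what the added guard in expectEvent below does. A small
self-contained sketch of that pattern (the types here are illustrative
stand-ins, not the real cluster structs):

    package main

    import "fmt"

    // Illustrative stand-ins for the real node/container types.
    type Container struct{ id string }

    type Node struct{ Container *Container }

    // findNode skips entries whose container has already been removed,
    // mirroring the `ctr.Container == nil` check added to expectEvent.
    func findNode(nodes []Node, id string) *Node {
        for i, n := range nodes {
            if n.Container == nil || n.Container.id != id {
                continue
            }
            return &nodes[i]
        }
        return nil
    }

    func main() {
        nodes := []Node{{Container: &Container{id: "roach0"}}, {Container: nil}}
        fmt.Println(findNode(nodes, "roach0") != nil) // true
        fmt.Println(findNode(nodes, "roach1") != nil) // false: no live container with that ID
    }
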
Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index f17149330ad3..dc1ab9330953 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -192,7 +192,7 @@ func CreateDocker(
 
 func (l *DockerCluster) expectEvent(c *Container, msgs ...string) {
 	for index, ctr := range l.Nodes {
-		if c.id != ctr.id {
+		if ctr.Container == nil || c.id != ctr.id {
 			continue
 		}
 		for _, status := range msgs {
@@ -724,11 +724,6 @@ func (l *DockerCluster) stop(ctx context.Context) {
 		<-l.monitorDone
 	}
 
-	if l.vols != nil {
-		maybePanic(l.vols.Kill(ctx))
-		maybePanic(l.vols.Remove(ctx))
-		l.vols = nil
-	}
 	for i, n := range l.Nodes {
 		if n.Container == nil {
 			continue
 		}
@@ -751,8 +746,14 @@ func (l *DockerCluster) stop(ctx context.Context) {
 			log.Infof(ctx, "~~~ node %d CRASHED ~~~~", i)
 		}
 		maybePanic(n.Remove(ctx))
+		n.Container = nil
+	}
+
+	if l.vols != nil {
+		maybePanic(l.vols.Kill(ctx))
+		maybePanic(l.vols.Remove(ctx))
+		l.vols = nil
 	}
-	l.Nodes = nil
 
 	if l.networkID != "" {
 		maybePanic(

From 032539cc28bc00735a4e607f5d0a96cb7e3d64f7 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 12:21:51 +0000
Subject: [PATCH 6/7] acceptance: ensure the vol directory is only deleted
 after shutdown

Prior to this patch, the docker volume directory was deleted before the
cluster nodes were shut down, which could cause them to crash abnormally
and have the Assert method return an unexpected failure, even when the
test would otherwise succeed.

This patch fixes it.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go |  1 +
 pkg/acceptance/util_docker.go           | 22 +++++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index dc1ab9330953..45f77362bf58 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -888,6 +888,7 @@ func (l *DockerCluster) Cleanup(ctx context.Context, preserveLogs bool) {
 	}
 	for _, v := range volumes {
 		if preserveLogs && v.Name() == "logs" {
+			log.Infof(ctx, "preserving log directory: %s", l.volumesDir)
 			continue
 		}
 		if err := os.RemoveAll(filepath.Join(l.volumesDir, v.Name())); err != nil {
diff --git a/pkg/acceptance/util_docker.go b/pkg/acceptance/util_docker.go
index a0f332840502..8104e2f59a14 100644
--- a/pkg/acceptance/util_docker.go
+++ b/pkg/acceptance/util_docker.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/build/bazel"
 	"github.com/cockroachdb/cockroach/pkg/security/username"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/containerd/containerd/platforms"
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
@@ -121,18 +122,33 @@ func testDocker(
 			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
 		}
 		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
-		defer l.AssertAndStop(ctx, t)
+
+		var preserveLogs bool
+		defer func() {
+			// Check the final health of the cluster nodes and
+			// stop the cluster after that.
+			l.AssertAndStop(ctx, t)
+
+			// Note: we must be careful to clean up the volumes *after*
+			// the cluster has been shut down (in the `AssertAndStop` call).
+			// Otherwise, the directory removal will cause the cluster nodes
+			// to crash and report abnormal termination, even when the test
+			// succeeds otherwise.
+			log.Infof(ctx, "cleaning up docker volume")
+			l.Cleanup(ctx, preserveLogs)
+		}()
 
 		if len(l.Nodes) > 0 {
 			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
 		}
 
+		log.Infof(ctx, "starting one-shot container")
 		err = l.OneShot(
 			ctx, acceptanceImage, types.ImagePullOptions{}, containerConfig, hostConfig,
 			platforms.DefaultSpec(), "docker-"+name,
 		)
-		preserveLogs := err != nil
-		l.Cleanup(ctx, preserveLogs)
+		log.Infof(ctx, "one-shot container terminated: %v", err)
+		preserveLogs = err != nil
 	})
 	return err
 }

From 9b4fd921b97d4cefe20f02cea82ab43adc64093c Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 12:23:51 +0000
Subject: [PATCH 7/7] acceptance: avoid races in the health assertion

Prior to this patch, the Assert code was reading events from the docker
event channel too fast, possibly missing some events in the process.

Because the events were read too fast, the problem fixed in the previous
commit did not always result in a test crash/failure (i.e. the assert was
usually too fast to do its job properly).

By slowing down the reading of events from docker, we can surface logic
errors in the surrounding test framework more clearly.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 45f77362bf58..d88a3fa3757c 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -669,7 +669,6 @@
 // the cluster (restart, kill, ...). In the event of a mismatch, the passed
 // Tester receives a fatal error.
 func (l *DockerCluster) Assert(ctx context.Context, t testing.TB) {
-	const almostZero = 50 * time.Millisecond
 	filter := func(ch chan Event, wait time.Duration) *Event {
 		select {
 		case act := <-ch:
@@ -681,17 +680,28 @@
 
 	var events []Event
 	for {
+		// The expected event channel is buffered and should contain
+		// all expected events already.
+		const almostZero = 15 * time.Millisecond
 		exp := filter(l.expectedEvents, almostZero)
 		if exp == nil {
 			break
 		}
-		act := filter(l.events, 15*time.Second)
+		t.Logf("expecting event: %v", exp)
+		// l.events is connected to the docker controller and may
+		// receive events more slowly.
+		const waitForDockerEvent = 15 * time.Second
+		act := filter(l.events, waitForDockerEvent)
+		t.Logf("got event: %v", act)
 		if act == nil || *exp != *act {
 			t.Fatalf("expected event %v, got %v (after %v)", exp, act, events)
 		}
 		events = append(events, *exp)
 	}
-	if cur := filter(l.events, almostZero); cur != nil {
+	// At the end, we leave docker a bit more time to report a final event,
+	// if any.
+	const waitForLastDockerEvent = 1 * time.Second
+	if cur := filter(l.events, waitForLastDockerEvent); cur != nil {
 		t.Fatalf("unexpected extra event %v (after %v)", cur, events)
 	}
 	if log.V(2) {