From 07c45b1c8254bf2cc44dea11d4395593fb0f7b4b Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 10:47:56 +0000
Subject: [PATCH 1/7] acceptance: link failing TestDockerCSharp to the proper
 issue

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/adapter_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/acceptance/adapter_test.go b/pkg/acceptance/adapter_test.go
index de3722439b1e..32be85856f7d 100644
--- a/pkg/acceptance/adapter_test.go
+++ b/pkg/acceptance/adapter_test.go
@@ -39,7 +39,7 @@ func TestDockerC(t *testing.T) {
 }
 
 func TestDockerCSharp(t *testing.T) {
-	skip.WithIssue(t, 58218, "flaky test")
+	skip.WithIssue(t, 86852, "test not up to date anymore")
 	s := log.Scope(t)
 	defer s.Close(t)

From a98bf544d1c2adf5d2de0b54269e37cfbd530f61 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 10:48:31 +0000
Subject: [PATCH 2/7] acceptance: fix the test dir preparation for
 docker-under-bzl tests

Prior to this change, the `defer` for the directory deletion could run
before the docker shutdown sequence. This was incorrect, and could cause
the docker container to fail too early. The result was that the docker
controller would report a `die` event before the test was expecting it
(in the `Assert` method), and the assert would fail.

This commit fixes it.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/util_docker.go | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/pkg/acceptance/util_docker.go b/pkg/acceptance/util_docker.go
index 3bfac33a67c9..a0f332840502 100644
--- a/pkg/acceptance/util_docker.go
+++ b/pkg/acceptance/util_docker.go
@@ -69,19 +69,6 @@ func testDocker(
 ) error {
 	var err error
 	RunDocker(t, func(t *testing.T) {
-		cfg := cluster.TestConfig{
-			Name:     name,
-			Duration: *flagDuration,
-		}
-		for i := 0; i < num; i++ {
-			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
-		}
-		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
-		defer l.AssertAndStop(ctx, t)
-
-		if len(l.Nodes) > 0 {
-			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
-		}
 		var pwd string
 		pwd, err = os.Getwd()
 		if err != nil {
@@ -121,6 +108,25 @@ func testDocker(
 			}()
 			hostConfig.Binds = append(hostConfig.Binds, interactivetestsDir+":/mnt/interactive_tests")
 		}
+
+		// Prepare the docker cluster.
+		// We need to do this "under" the directory preparation above so as
+		// to prevent the test from crashing because the directory gets
+		// deleted before the container shutdown assertions get a chance to run.
+		cfg := cluster.TestConfig{
+			Name:     name,
+			Duration: *flagDuration,
+		}
+		for i := 0; i < num; i++ {
+			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
+		}
+		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
+		defer l.AssertAndStop(ctx, t)
+
+		if len(l.Nodes) > 0 {
+			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
+		}
+
 		err = l.OneShot(
 			ctx, acceptanceImage, types.ImagePullOptions{}, containerConfig, hostConfig,
 			platforms.DefaultSpec(), "docker-"+name,

From 24b86c9d27da941a8907720908fa8bbc5ac6eb7e Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:44:33 +0000
Subject: [PATCH 3/7] acceptance: ensure the monitor is stopped before shutdown

The cluster stop() method was not waiting for the monitor goroutine to
actually shut down before it would go on to kill the nodes ungracefully.
The result could be spurious `die` events in the final assertion.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 9fb5bb22ac49..8d97932a3ea2 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -129,6 +129,7 @@ type DockerCluster struct {
 	stopper              *stop.Stopper
 	monitorCtx           context.Context
 	monitorCtxCancelFunc func()
+	monitorDone          chan struct{}
 	clusterID            string
 	networkID            string
 	networkName          string
@@ -544,6 +545,7 @@ func (l *DockerCluster) processEvent(ctx context.Context, event events.Message)
 	// If there's currently a oneshot container, ignore any die messages from
 	// it because those are expected.
 	if l.oneshot != nil && event.ID == l.oneshot.id && event.Status == eventDie {
+		log.Infof(ctx, "Docker event was: the oneshot container terminated")
 		return true
 	}
 
@@ -585,7 +587,9 @@ func (l *DockerCluster) processEvent(ctx context.Context, event events.Message)
 	return false
 }
 
-func (l *DockerCluster) monitor(ctx context.Context) {
+func (l *DockerCluster) monitor(ctx context.Context, monitorDone chan struct{}) {
+	defer close(monitorDone)
+
 	if log.V(1) {
 		log.Infof(ctx, "events monitor starts")
 		defer log.Infof(ctx, "events monitor exits")
@@ -603,6 +607,9 @@
 	})
 	for {
 		select {
+		case <-l.monitorCtx.Done():
+			log.Infof(ctx, "monitor shutting down")
+			return false
 		case err := <-errq:
 			log.Infof(ctx, "event stream done, resetting...: %s", err)
 			// Sometimes we get a random string-wrapped EOF error back.
@@ -640,7 +647,8 @@ func (l *DockerCluster) Start(ctx context.Context) {
 	log.Infof(ctx, "starting %d nodes", len(l.Nodes))
 	l.monitorCtx, l.monitorCtxCancelFunc = context.WithCancel(context.Background())
-	go l.monitor(ctx)
+	l.monitorDone = make(chan struct{})
+	go l.monitor(ctx, l.monitorDone)
 	var wg sync.WaitGroup
 	wg.Add(len(l.Nodes))
 	for _, node := range l.Nodes {
@@ -713,6 +721,7 @@ func (l *DockerCluster) stop(ctx context.Context) {
 	if l.monitorCtxCancelFunc != nil {
 		l.monitorCtxCancelFunc()
 		l.monitorCtxCancelFunc = nil
+		<-l.monitorDone
 	}
 
 	if l.vols != nil {

From 45de70e4c9b45df6daf75e660f8cd6f10dd2ac0d Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:48:07 +0000
Subject: [PATCH 4/7] acceptance/cluster: simplify the interface

This clarifies that the container is not expected to run anymore at the
end of a wait.

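For reference, here is a self-contained sketch of the wait pattern the
renamed helper wraps, using the Docker Go SDK directly. The client setup
and the container ID below are illustrative placeholders, not taken from
the test harness:

    package main

    import (
        "context"
        "log"

        "github.com/docker/docker/api/types/container"
        "github.com/docker/docker/client"
    )

    // waitUntilNotRunning blocks until the given container has exited,
    // which is the post-condition the new method name is meant to convey.
    func waitUntilNotRunning(ctx context.Context, cli *client.Client, id string) error {
        // WaitConditionNotRunning resolves once the container is no longer running.
        okCh, errCh := cli.ContainerWait(ctx, id, container.WaitConditionNotRunning)
        select {
        case err := <-errCh:
            return err
        case <-okCh:
            return nil
        }
    }

    func main() {
        cli, err := client.NewClientWithOpts(client.FromEnv)
        if err != nil {
            log.Fatal(err)
        }
        // "some-container" is a placeholder container ID.
        if err := waitUntilNotRunning(context.Background(), cli, "some-container"); err != nil {
            log.Fatal(err)
        }
    }

The renamed method in docker.go is essentially this select, bound to the
test cluster's client and the container's own ID.
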
Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/docker.go        | 6 +++---
 pkg/acceptance/cluster/dockercluster.go | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pkg/acceptance/cluster/docker.go b/pkg/acceptance/cluster/docker.go
index 33abb281c510..188f633a3e5f 100644
--- a/pkg/acceptance/cluster/docker.go
+++ b/pkg/acceptance/cluster/docker.go
@@ -287,9 +287,9 @@ func (c *Container) Restart(ctx context.Context, timeout *time.Duration) error {
 	return nil
 }
 
-// Wait waits for a running container to exit.
-func (c *Container) Wait(ctx context.Context, condition container.WaitCondition) error {
-	waitOKBodyCh, errCh := c.cluster.client.ContainerWait(ctx, c.id, condition)
+// WaitUntilNotRunning waits for a running container to exit.
+func (c *Container) WaitUntilNotRunning(ctx context.Context) error {
+	waitOKBodyCh, errCh := c.cluster.client.ContainerWait(ctx, c.id, container.WaitConditionNotRunning)
 	select {
 	case err := <-errCh:
 		return err
diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 8d97932a3ea2..f17149330ad3 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -238,7 +238,7 @@ func (l *DockerCluster) OneShot(
 	if err := l.oneshot.Start(ctx); err != nil {
 		return err
 	}
-	return l.oneshot.Wait(ctx, container.WaitConditionNotRunning)
+	return l.oneshot.WaitUntilNotRunning(ctx)
 }
 
 // stopOnPanic is invoked as a deferred function in Start in order to attempt
@@ -375,7 +375,7 @@ func (l *DockerCluster) initCluster(ctx context.Context) {
 	// and it'll get in the way of future runs.
 	l.vols = c
 	maybePanic(c.Start(ctx))
-	maybePanic(c.Wait(ctx, container.WaitConditionNotRunning))
+	maybePanic(c.WaitUntilNotRunning(ctx))
 }
 
 // cockroachEntryPoint returns the value to be used as

From 551c2e1affe4ef1db2f98cb43e0fe3adf8e9db44 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 11:50:25 +0000
Subject: [PATCH 5/7] acceptance: remove the volume ctn after the node
 containers

(Since the latter depend on the former)

Also, this preserves the Nodes slice, so it can be inspected with a
debugger.

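As a consequence of keeping the slice, a node's Container pointer is nil
once it has been torn down, so loops over Nodes must skip such entries;
this is what the added guard in expectEvent below does. A small
self-contained sketch of that pattern (the types here are illustrative
stand-ins, not the real cluster structs):

    package main

    import "fmt"

    // Illustrative stand-ins for the real node/container types.
    type Container struct{ id string }

    type Node struct{ Container *Container }

    // findNode skips entries whose container has already been removed,
    // mirroring the `ctr.Container == nil` check added to expectEvent.
    func findNode(nodes []Node, id string) *Node {
        for i, n := range nodes {
            if n.Container == nil || n.Container.id != id {
                continue
            }
            return &nodes[i]
        }
        return nil
    }

    func main() {
        nodes := []Node{{Container: &Container{id: "roach0"}}, {Container: nil}}
        fmt.Println(findNode(nodes, "roach0") != nil) // true
        fmt.Println(findNode(nodes, "roach1") != nil) // false: no live container with that ID
    }
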
Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index f17149330ad3..dc1ab9330953 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -192,7 +192,7 @@ func CreateDocker(
 
 func (l *DockerCluster) expectEvent(c *Container, msgs ...string) {
 	for index, ctr := range l.Nodes {
-		if c.id != ctr.id {
+		if ctr.Container == nil || c.id != ctr.id {
 			continue
 		}
 		for _, status := range msgs {
@@ -724,11 +724,6 @@ func (l *DockerCluster) stop(ctx context.Context) {
 		<-l.monitorDone
 	}
 
-	if l.vols != nil {
-		maybePanic(l.vols.Kill(ctx))
-		maybePanic(l.vols.Remove(ctx))
-		l.vols = nil
-	}
 	for i, n := range l.Nodes {
 		if n.Container == nil {
 			continue
 		}
@@ -751,8 +746,14 @@ func (l *DockerCluster) stop(ctx context.Context) {
 			log.Infof(ctx, "~~~ node %d CRASHED ~~~~", i)
 		}
 		maybePanic(n.Remove(ctx))
+		n.Container = nil
+	}
+
+	if l.vols != nil {
+		maybePanic(l.vols.Kill(ctx))
+		maybePanic(l.vols.Remove(ctx))
+		l.vols = nil
 	}
-	l.Nodes = nil
 
 	if l.networkID != "" {
 		maybePanic(

From 032539cc28bc00735a4e607f5d0a96cb7e3d64f7 Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 12:21:51 +0000
Subject: [PATCH 6/7] acceptance: ensure the vol directory is only deleted
 after shutdown

Prior to this patch, the docker volume directory was deleted before the
cluster nodes were shut down, which could cause them to crash abnormally
and have the Assert method return an unexpected failure, even when the
test would otherwise succeed.

This patch fixes it.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go |  1 +
 pkg/acceptance/util_docker.go           | 22 +++++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index dc1ab9330953..45f77362bf58 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -888,6 +888,7 @@ func (l *DockerCluster) Cleanup(ctx context.Context, preserveLogs bool) {
 	}
 	for _, v := range volumes {
 		if preserveLogs && v.Name() == "logs" {
+			log.Infof(ctx, "preserving log directory: %s", l.volumesDir)
 			continue
 		}
 		if err := os.RemoveAll(filepath.Join(l.volumesDir, v.Name())); err != nil {
diff --git a/pkg/acceptance/util_docker.go b/pkg/acceptance/util_docker.go
index a0f332840502..8104e2f59a14 100644
--- a/pkg/acceptance/util_docker.go
+++ b/pkg/acceptance/util_docker.go
@@ -22,6 +22,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/build/bazel"
 	"github.com/cockroachdb/cockroach/pkg/security/username"
+	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/containerd/containerd/platforms"
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
@@ -121,18 +122,33 @@ func testDocker(
 			cfg.Nodes = append(cfg.Nodes, cluster.NodeConfig{Stores: []cluster.StoreConfig{{}}})
 		}
 		l := StartCluster(ctx, t, cfg).(*cluster.DockerCluster)
-		defer l.AssertAndStop(ctx, t)
+
+		var preserveLogs bool
+		defer func() {
+			// Check the final health of the cluster nodes and
+			// stop the cluster after that.
+			l.AssertAndStop(ctx, t)
+
+			// Note: we must be careful to clean up the volumes *after*
+			// the cluster has been shut down (in the `AssertAndStop` call).
+			// Otherwise, the directory removal will cause the cluster nodes
+			// to crash and report abnormal termination, even when the test
+			// succeeds otherwise.
+			log.Infof(ctx, "cleaning up docker volume")
+			l.Cleanup(ctx, preserveLogs)
+		}()
 
 		if len(l.Nodes) > 0 {
 			containerConfig.Env = append(containerConfig.Env, "PGHOST="+l.Hostname(0))
 		}
 
+		log.Infof(ctx, "starting one-shot container")
 		err = l.OneShot(
 			ctx, acceptanceImage, types.ImagePullOptions{}, containerConfig, hostConfig,
 			platforms.DefaultSpec(), "docker-"+name,
 		)
-		preserveLogs := err != nil
-		l.Cleanup(ctx, preserveLogs)
+		log.Infof(ctx, "one-shot container terminated: %v", err)
+		preserveLogs = err != nil
 	})
 	return err
 }

From 9b4fd921b97d4cefe20f02cea82ab43adc64093c Mon Sep 17 00:00:00 2001
From: Raphael 'kena' Poss
Date: Thu, 25 Aug 2022 12:23:51 +0000
Subject: [PATCH 7/7] acceptance: avoid races in the health assertion

Prior to this patch, the Assert code was reading events from the docker
event channel too fast, possibly missing some events in the process.

Because the events were read too fast, the problem fixed in the previous
commit did not always result in a test crash/failure (i.e. the assert was
usually too fast to do its job properly).

By slowing down the reading of events from docker, we can surface logic
errors in the surrounding test framework more clearly.

Release justification: non-production code changes

Release note: None
---
 pkg/acceptance/cluster/dockercluster.go | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go
index 45f77362bf58..d88a3fa3757c 100644
--- a/pkg/acceptance/cluster/dockercluster.go
+++ b/pkg/acceptance/cluster/dockercluster.go
@@ -669,7 +669,6 @@
 // the cluster (restart, kill, ...). In the event of a mismatch, the passed
 // Tester receives a fatal error.
 func (l *DockerCluster) Assert(ctx context.Context, t testing.TB) {
-	const almostZero = 50 * time.Millisecond
 	filter := func(ch chan Event, wait time.Duration) *Event {
 		select {
 		case act := <-ch:
@@ -681,17 +680,28 @@
 
 	var events []Event
 	for {
+		// The expected event channel is buffered and should contain
+		// all expected events already.
+		const almostZero = 15 * time.Millisecond
 		exp := filter(l.expectedEvents, almostZero)
 		if exp == nil {
 			break
 		}
-		act := filter(l.events, 15*time.Second)
+		t.Logf("expecting event: %v", exp)
+		// l.events is connected to the docker controller and may
+		// receive events more slowly.
+		const waitForDockerEvent = 15 * time.Second
+		act := filter(l.events, waitForDockerEvent)
+		t.Logf("got event: %v", act)
 		if act == nil || *exp != *act {
 			t.Fatalf("expected event %v, got %v (after %v)", exp, act, events)
 		}
 		events = append(events, *exp)
 	}
-	if cur := filter(l.events, almostZero); cur != nil {
+	// At the end, we leave docker a bit more time to report a final event,
+	// if any.
+	const waitForLastDockerEvent = 1 * time.Second
+	if cur := filter(l.events, waitForLastDockerEvent); cur != nil {
 		t.Fatalf("unexpected extra event %v (after %v)", cur, events)
 	}
 	if log.V(2) {