From 800435a4e721e6affe0ede74a666dfedd43fc472 Mon Sep 17 00:00:00 2001 From: Peter Brachwitz Date: Wed, 27 Sep 2023 21:28:55 +0200 Subject: [PATCH] Try to stabilise tests by checking for cluster health early (#7184) Add an immediate health check after creation to delay subsequent k8s resource creations to a point when ES is available. The main motivation is test stability in cases where Beats would send data immediately without waiting for proper ES initialisation see https://github.com/elastic/cloud-on-k8s/issues/7172 Add an opt-out mechanism for tests where we know the health check cannot succeed. TODO: it is unclear at this point why the remote cluster test fails if we wait for cluster 1 to come up first. I will create a follow up issue --- test/e2e/es/forced_upgrade_test.go | 9 ++++++--- test/e2e/es/remote_cluster_test.go | 5 ++++- test/e2e/test/elasticsearch/builder.go | 6 ++++++ test/e2e/test/elasticsearch/steps_creation.go | 8 ++++++-- test/e2e/test/helper/yaml.go | 2 ++ 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/test/e2e/es/forced_upgrade_test.go b/test/e2e/es/forced_upgrade_test.go index ccb40e6531..70e3aa5324 100644 --- a/test/e2e/es/forced_upgrade_test.go +++ b/test/e2e/es/forced_upgrade_test.go @@ -23,7 +23,8 @@ import ( func TestForceUpgradePendingPods(t *testing.T) { // create a cluster whose Pods will stay Pending forever initial := elasticsearch.NewBuilder("force-upgrade-pending"). - WithESMasterDataNodes(3, elasticsearch.DefaultResources) + WithESMasterDataNodes(3, elasticsearch.DefaultResources). + SkipImmediateHealthCheck() initial.Elasticsearch.Spec.NodeSets[0].PodTemplate.Spec.NodeSelector = map[string]string{ "cannot": "be-scheduled", } @@ -55,7 +56,8 @@ func TestForceUpgradePendingPodsInOneStatefulSet(t *testing.T) { Name: "pending", Count: 1, PodTemplate: elasticsearch.ESPodTemplate(elasticsearch.DefaultResources), - }) + }). + SkipImmediateHealthCheck() // make Pods of the 2nds NodeSet pending initial.Elasticsearch.Spec.NodeSets[1].PodTemplate.Spec.NodeSelector = map[string]string{ @@ -121,7 +123,8 @@ func TestForceUpgradeBootloopingPods(t *testing.T) { "masterdata": { "this leads": "to a bootlooping instance", }, - }) + }). + SkipImmediateHealthCheck() // fix that cluster to remove the wrong configuration fixed := initial.WithNoESTopology().WithESMasterDataNodes(3, elasticsearch.DefaultResources) diff --git a/test/e2e/es/remote_cluster_test.go b/test/e2e/es/remote_cluster_test.go index ecbaf11237..c4e14ef064 100644 --- a/test/e2e/es/remote_cluster_test.go +++ b/test/e2e/es/remote_cluster_test.go @@ -40,7 +40,10 @@ func TestRemoteCluster(t *testing.T) { es1Builder := elasticsearch.NewBuilder(name). WithNamespace(ns1). WithESMasterDataNodes(1, elasticsearch.DefaultResources). - WithRestrictedSecurityContext() + WithRestrictedSecurityContext(). + // TODO figure out why this is necessary + SkipImmediateHealthCheck() + es1LicenseTestContext := elasticsearch.NewLicenseTestContext(test.NewK8sClientOrFatal(), es1Builder.Elasticsearch) ns2 := test.Ctx().ManagedNamespace(1) diff --git a/test/e2e/test/elasticsearch/builder.go b/test/e2e/test/elasticsearch/builder.go index f133d496fa..db66118735 100644 --- a/test/e2e/test/elasticsearch/builder.go +++ b/test/e2e/test/elasticsearch/builder.go @@ -57,6 +57,7 @@ type Builder struct { GlobalCA bool mutationToleratedChecksFailureCount int + skipImmediateHealthCheck bool } func (b Builder) DeepCopy() *Builder { @@ -111,6 +112,11 @@ func newBuilder(name, randSuffix string) Builder { WithLabel(run.TestNameLabel, name) } +func (b Builder) SkipImmediateHealthCheck() Builder { + b.skipImmediateHealthCheck = true + return b +} + func (b Builder) WithAnnotation(key, value string) Builder { if b.Elasticsearch.ObjectMeta.Annotations == nil { b.Elasticsearch.ObjectMeta.Annotations = make(map[string]string) diff --git a/test/e2e/test/elasticsearch/steps_creation.go b/test/e2e/test/elasticsearch/steps_creation.go index fa6ee41c01..373b1cce8d 100644 --- a/test/e2e/test/elasticsearch/steps_creation.go +++ b/test/e2e/test/elasticsearch/steps_creation.go @@ -32,8 +32,12 @@ func (b Builder) CreationTestSteps(k *test.K8sClient) test.StepList { if b.Elasticsearch.Spec.Version != createdEs.Spec.Version { return fmt.Errorf("expected version %s but got %s", b.Elasticsearch.Spec.Version, createdEs.Spec.Version) } - // TODO this is incomplete - return nil + if b.skipImmediateHealthCheck { + return nil + } + // check cluster health here already during the create steps to make sure any dependent builders can rely on a ready ES + // this is special casing Elasticsearch a little bit in the interest of test stability see https://github.com/elastic/cloud-on-k8s/issues/7172 + return clusterHealthIs(b, k, esv1.ElasticsearchGreenHealth) }), }, }) diff --git a/test/e2e/test/helper/yaml.go b/test/e2e/test/helper/yaml.go index e93484fd9b..47b5e60be7 100644 --- a/test/e2e/test/helper/yaml.go +++ b/test/e2e/test/helper/yaml.go @@ -123,6 +123,8 @@ func (yd *YAMLDecoder) ToBuilders(reader *bufio.Reader, transform BuilderTransfo builders = append(builders, builder) } + // sort to make sure the ES builders come first to minimise the chance of test failures as in https://github.com/elastic/cloud-on-k8s/issues/7172 + sortBuilders(builders) return builders, nil }