diff --git a/changelog/v1.18.0-rc3/span-name-flake.yaml b/changelog/v1.18.0-rc3/span-name-flake.yaml
new file mode 100644
index 00000000000..34ed464a749
--- /dev/null
+++ b/changelog/v1.18.0-rc3/span-name-flake.yaml
@@ -0,0 +1,17 @@
+changelog:
+  - type: NON_USER_FACING
+    issueLink: https://github.com/solo-io/gloo/issues/10365
+    resolvesIssue: false
+    description: >-
+      Remove the chance of a flake occurring due to an UpstreamNotFound error
+  - type: NON_USER_FACING
+    issueLink: https://github.com/k8sgateway/k8sgateway/issues/10327
+    resolvesIssue: false
+    description: >-
+      Remove the chance of a flake occurring due to an UpstreamNotFound error
+  - type: NON_USER_FACING
+    issueLink: https://github.com/k8sgateway/k8sgateway/issues/10293
+    resolvesIssue: false
+    description: >-
+      Mask a product UX issue by enabling some retry logic in a test. The attached
+      issue should resolve the bad UX -and- remove the retry logic introduced here.
\ No newline at end of file
diff --git a/test/kubernetes/e2e/debugging.md b/test/kubernetes/e2e/debugging.md
index e5048fc456b..3b13aebdc0e 100644
--- a/test/kubernetes/e2e/debugging.md
+++ b/test/kubernetes/e2e/debugging.md
@@ -8,7 +8,34 @@ The entry point for an e2e test is a Go test function of the form `func TestXyz(
 Each feature suite is invoked as a subtest of the top level suite.
 The subtests use [testify](https://github.com/stretchr/testify) to structure the tests in the feature's test suite and make use of the library's assertions.
 
-## Workflows
+## Step 1: Setting Up A Cluster
+### Using a previously released version
+It is possible to run these tests against a previously released version of Gloo Gateway. This is useful for testing a release candidate or a nightly build.
+
+There is no setup required for this option, as the test suite will download the helm chart archive and `glooctl` binary from the specified release. You will use the `RELEASED_VERSION` environment variable when running the tests. See the [variable definition](/test/testutils/env.go) for more details.
+
+### Using a locally built version
+For these tests to run, we require the following conditions:
+- Gloo Gateway Helm chart archive is present in the `_test` folder
+- `glooctl` is built in the `_output` folder
+- A KinD cluster is set up and loaded with the images to be installed by the helm chart
+
+[ci/kind/setup-kind.sh](/ci/kind/setup-kind.sh) is run in CI to set up the test environment for the above requirements.
+It accepts a number of environment variables to control the creation of a kind cluster and the deployment of Gloo resources to that kind cluster. Please refer to the script itself to see what variables are available.
+
+Example:
+```bash
+CLUSTER_NAME=solo-test-cluster CLUSTER_NODE_VERSION=v1.30.0 VERSION=v1.0.0-solo-test ci/kind/setup-kind.sh
+```
+
+## Step 2: Running Tests
+_To run the regression tests, your kubeconfig file must point to a running Kubernetes cluster:_
+```
+kubectl config current-context
+```
+_should output a context beginning with `kind-`_
+
+> Note: If you are running tests against a previously released version, you must set `RELEASED_VERSION` when invoking the tests.
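+
+For example, a suite's top-level test can be invoked directly with `go test`. The package path, test name, and version below are illustrative placeholders; substitute the suite you want to run (see the next section) and, if applicable, the release you are testing:
+```bash
+# Against a locally built version (cluster prepared as described in Step 1)
+go test -v ./test/kubernetes/e2e/tests/... -run TestXyz
+
+# Against a previously released build; see /test/testutils/env.go for the expected value format
+RELEASED_VERSION=1.18.0-rc3 go test -v ./test/kubernetes/e2e/tests/... -run TestXyz
+```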
 
 ### Running a single feature's suite
 
diff --git a/test/kubernetes/e2e/features/tracing/suite.go b/test/kubernetes/e2e/features/tracing/suite.go
index 174f594ca68..1986f5ba697 100644
--- a/test/kubernetes/e2e/features/tracing/suite.go
+++ b/test/kubernetes/e2e/features/tracing/suite.go
@@ -75,6 +75,20 @@ func (s *testingSuite) SetupSuite() {
 			LabelSelector: "app.kubernetes.io/name=http-echo",
 		},
 	)
+
+	// Previously, we would create/delete the Service for each test. However, this would occasionally lead to:
+	// * Hostname gateway-proxy-tracing.gloo-gateway-edge-test.svc.cluster.local was found in DNS cache
+	// * Trying 10.96.181.139:18080...
+	// * Connection timed out after 3001 milliseconds
+	//
+	// The suspicion is that the rotation of the Service meant that the DNS cache became out of date,
+	// and we would curl the old IP.
+	// The workaround to that is to create the service just once at the beginning of the suite.
+	// This mirrors how Services are typically managed in Gloo Gateway, where they are tied
+	// to an installation, and not dynamically updated.
+	err = s.testInstallation.Actions.Kubectl().ApplyFile(s.ctx, gatewayProxyServiceManifest,
+		"-n", s.testInstallation.Metadata.InstallNamespace)
+	s.NoError(err, "can apply service/gateway-proxy-tracing")
 }
 
 func (s *testingSuite) TearDownSuite() {
@@ -85,6 +99,10 @@ func (s *testingSuite) TearDownSuite() {
 
 	err = s.testInstallation.Actions.Kubectl().DeleteFile(s.ctx, testdefaults.HttpEchoPodManifest)
 	s.Assertions.NoError(err, "can delete echo server")
+
+	err = s.testInstallation.Actions.Kubectl().DeleteFile(s.ctx, gatewayProxyServiceManifest,
+		"-n", s.testInstallation.Metadata.InstallNamespace)
+	s.NoError(err, "can delete service/gateway-proxy-tracing")
 }
 
 func (s *testingSuite) BeforeTest(string, string) {
@@ -98,8 +116,19 @@ func (s *testingSuite) BeforeTest(string, string) {
 		otelcolSelector,
 	)
 
-	err = s.testInstallation.Actions.Kubectl().ApplyFile(s.ctx, tracingConfigManifest)
-	s.NoError(err, "can apply gloo tracing resources")
+	// Technical Debt!!
+	// https://github.com/k8sgateway/k8sgateway/issues/10293
+	// There is a bug in the Control Plane that results in an Error being reported on the status
+	// when the Upstream of the Tracing Collector is not found. This results in the VirtualService
+	// that references that Upstream being rejected. What should occur is that a Warning is reported,
+	// and the resource is accepted, since validation.allowWarnings=true is set.
+	// We have plans to fix this in the code itself. But as a short-term solution, to reduce the
+	// noise in CI/CD from this test flaking, we perform some simple retry logic here.
+	s.EventuallyWithT(func(c *assert.CollectT) {
+		err = s.testInstallation.Actions.Kubectl().ApplyFile(s.ctx, tracingConfigManifest)
+		assert.NoError(c, err, "can apply gloo tracing resources")
+	}, time.Second*5, time.Second*1, "can apply tracing resources")
+
 	// accept the upstream
 	// Upstreams no longer report status if they have not been translated at all to avoid conflicting with
 	// other syncers that have translated them, so we can only detect that the objects exist here
@@ -109,6 +138,7 @@ func (s *testingSuite) BeforeTest(string, string) {
 				otelcolUpstream.Namespace, otelcolUpstream.Name, clients.ReadOpts{Ctx: s.ctx})
 		},
 	)
+
 	// accept the virtual service
 	s.testInstallation.Assertions.EventuallyResourceStatusMatchesState(
 		func() (resources.InputResource, error) {
@@ -142,7 +172,7 @@ func (s *testingSuite) AfterTest(string, string) {
 
 	err = s.testInstallation.Actions.Kubectl().DeleteFile(s.ctx, gatewayConfigManifest,
 		"-n", s.testInstallation.Metadata.InstallNamespace)
-	s.Assertions.NoError(err, "can delete gloo tracing config")
+	s.Assertions.NoError(err, "can delete gateway config")
 }
 
 func (s *testingSuite) TestSpanNameTransformationsWithoutRouteDecorator() {
@@ -156,6 +186,10 @@ func (s *testingSuite) TestSpanNameTransformationsWithoutRouteDecorator() {
 			curl.WithHostHeader(testHostname),
 			curl.WithPort(gatewayProxyPort),
 			curl.WithPath(pathWithoutRouteDescriptor),
+			// We are asserting that a request is consistent. To prevent flakes with that assertion,
+			// we should have some basic retries built into the request.
+			curl.WithRetryConnectionRefused(true),
+			curl.WithRetries(3, 0, 10),
 			curl.Silent(),
 		},
 		&matchers.HttpResponse{
@@ -183,6 +217,10 @@ func (s *testingSuite) TestSpanNameTransformationsWithRouteDecorator() {
 			curl.WithHostHeader("example.com"),
 			curl.WithPort(gatewayProxyPort),
 			curl.WithPath(pathWithRouteDescriptor),
+			// We are asserting that a request is consistent. To prevent flakes with that assertion,
+			// we should have some basic retries built into the request.
+			curl.WithRetryConnectionRefused(true),
+			curl.WithRetries(3, 0, 10),
 			curl.Silent(),
 		},
 		&matchers.HttpResponse{
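A note on the retry options added above: assuming `curl.WithRetryConnectionRefused(true)` and `curl.WithRetries(3, 0, 10)` map onto curl's retry flags as their names suggest (retry count, retry delay, and maximum retry time), the request the tests build corresponds roughly to the sketch below. The address, port, and path are placeholders for the values the suite resolves at runtime:
```bash
# Rough curl equivalent of the options used in the tests above (flag mapping assumed from the option names)
curl --silent \
  --retry-connrefused \
  --retry 3 \
  --retry-delay 0 \
  --retry-max-time 10 \
  --header "Host: example.com" \
  "http://<gateway-proxy-address>:<gateway-proxy-port>/<path-with-route-descriptor>"
```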
diff --git a/test/kubernetes/e2e/features/tracing/testdata/gateway.yaml b/test/kubernetes/e2e/features/tracing/testdata/gateway.yaml
index 7a04283601e..8ff7704c567 100644
--- a/test/kubernetes/e2e/features/tracing/testdata/gateway.yaml
+++ b/test/kubernetes/e2e/features/tracing/testdata/gateway.yaml
@@ -23,20 +23,4 @@ spec:
           collectorUpstreamRef:
             name: opentelemetry-collector
             namespace: default
 
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: gateway-proxy-tracing
-  labels:
-    app.kubernetes.io/name: gateway-proxy-tracing-service
-spec:
-  ports:
-    - name: gateway-proxy-tracing
-      port: 18080
-      protocol: TCP
-      targetPort: 18080
-  selector:
-    gateway-proxy: live
-    gateway-proxy-id: gateway-proxy
diff --git a/test/kubernetes/e2e/features/tracing/testdata/gw-proxy-tracing-service.yaml b/test/kubernetes/e2e/features/tracing/testdata/gw-proxy-tracing-service.yaml
new file mode 100644
index 00000000000..cc00e679877
--- /dev/null
+++ b/test/kubernetes/e2e/features/tracing/testdata/gw-proxy-tracing-service.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: gateway-proxy-tracing
+  labels:
+    app.kubernetes.io/name: gateway-proxy-tracing-service
+spec:
+  type: LoadBalancer
+  ports:
+    # This service exposes the Port 18080, used by the Gateway defined in ./gateway.yaml
+    - name: gateway-proxy-tracing
+      port: 18080
+      protocol: TCP
+      targetPort: 18080
+  # This selector is meant to match the Selector of the deployed gateway-proxy Service
+  # We intend to route traffic to the gateway-proxy pod(s) that are deployed at install time
+  selector:
+    gateway-proxy-id: gateway-proxy
+    gateway-proxy: live
+---
\ No newline at end of file
diff --git a/test/kubernetes/e2e/features/tracing/types.go b/test/kubernetes/e2e/features/tracing/types.go
index 1ea872ed746..0330a1f2b81 100644
--- a/test/kubernetes/e2e/features/tracing/types.go
+++ b/test/kubernetes/e2e/features/tracing/types.go
@@ -17,9 +17,10 @@ const (
 )
 
 var (
-	setupOtelcolManifest  = filepath.Join(util.MustGetThisDir(), "testdata", "setup-otelcol.yaml")
-	tracingConfigManifest = filepath.Join(util.MustGetThisDir(), "testdata", "tracing.yaml")
-	gatewayConfigManifest = filepath.Join(util.MustGetThisDir(), "testdata", "gateway.yaml")
+	setupOtelcolManifest        = filepath.Join(util.MustGetThisDir(), "testdata", "setup-otelcol.yaml")
+	tracingConfigManifest       = filepath.Join(util.MustGetThisDir(), "testdata", "tracing.yaml")
+	gatewayConfigManifest       = filepath.Join(util.MustGetThisDir(), "testdata", "gateway.yaml")
+	gatewayProxyServiceManifest = filepath.Join(util.MustGetThisDir(), "testdata", "gw-proxy-tracing-service.yaml")
 
 	otelcolPod = &corev1.Pod{
 		ObjectMeta: metav1.ObjectMeta{Name: "otel-collector", Namespace: "default"},