From f059cf08047beffe72bc4667fcf6ebae5aa155a8 Mon Sep 17 00:00:00 2001 From: Michael Pleshakov Date: Wed, 13 Mar 2024 12:18:31 -0400 Subject: [PATCH] Add functional test for product telemetry (#1659) * Add functional test for product telemetry Problem: Ensure the product telemetry feature is tested with a functional test Solution: - Add a functional test. - Because it requires an NGF image with a custom build, it needs to run with the telemetry label. - Enable the telemetry test in the pipeline Testing: Ran successfully: - make test TAG=$(whoami) GINKGO_LABEL=telemetry - make test TAG=$(whoami) # telemetry test didn't run, as expected; the functional tests succeeded Also, the pipeline runs the telemetry tests successfully CLOSES - https://github.com/nginxinc/nginx-gateway-fabric/issues/1640 Co-authored-by: Saylor Berman --- .github/workflows/ci.yml | 2 + .github/workflows/conformance.yml | 3 + .github/workflows/functional.yml | 7 +- .goreleaser.yml | 8 +- tests/Makefile | 6 +- tests/README.md | 46 +++-- tests/framework/resourcemanager.go | 93 +++++++++- .../manifests/telemetry/collector-values.yaml | 31 ++++ tests/suite/system_suite_test.go | 24 ++- tests/suite/telemetry_test.go | 166 ++++++++++++++++++ 10 files changed, 357 insertions(+), 29 deletions(-) create mode 100644 tests/suite/manifests/telemetry/collector-values.yaml create mode 100644 tests/suite/telemetry_test.go diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da39507a2..35a39aa9d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -151,6 +151,8 @@ jobs: AZURE_STORAGE_KEY: ${{ secrets.AZURE_STORAGE_KEY }} AZURE_BUCKET_NAME: ${{ secrets.AZURE_BUCKET_NAME }} SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_COMMUNITY }} + TELEMETRY_ENDPOINT: "" # disables sending telemetry + TELEMETRY_ENDPOINT_INSECURE: "false" - name: Cache Artifacts uses: actions/cache@ab5e6d0c87105b4c9c2047343972218f562e4319 # v4.0.1 diff --git a/.github/workflows/conformance.yml b/.github/workflows/conformance.yml index 419d75749..72a445034 100644 --- a/.github/workflows/conformance.yml +++ b/.github/workflows/conformance.yml @@ -84,6 +84,9 @@ jobs: with: version: latest args: build --snapshot --clean + env: + TELEMETRY_ENDPOINT: "" # disables sending telemetry + TELEMETRY_ENDPOINT_INSECURE: "false" - name: Build NGF Docker Image uses: docker/build-push-action@af5a7ed5ba88268d5278f7203fb52cd833f66d6e # v5.2.0 diff --git a/.github/workflows/functional.yml b/.github/workflows/functional.yml index 76959825d..a563c30e8 100644 --- a/.github/workflows/functional.yml +++ b/.github/workflows/functional.yml @@ -73,6 +73,9 @@ jobs: with: version: latest args: build --snapshot --clean + env: + TELEMETRY_ENDPOINT: otel-collector-opentelemetry-collector.collector.svc.cluster.local:4317 + TELEMETRY_ENDPOINT_INSECURE: "true" - name: Build NGF Docker Image uses: docker/build-push-action@4a13e500e55cf31b7a5d59a38ab2040ab0f42f56 # v5.1.0 @@ -116,9 +119,9 @@ jobs: make load-images${{ matrix.nginx-image == 'nginx-plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} working-directory: ./tests - - name: Run functional tests + - name: Run functional telemetry tests run: | ngf_prefix=ghcr.io/nginxinc/nginx-gateway-fabric ngf_tag=${{ steps.ngf-meta.outputs.version }} - make test${{ matrix.nginx-image == 'nginx-plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} + make test${{ matrix.nginx-image == 'nginx-plus' && '-with-plus' || ''}} PREFIX=${ngf_prefix} TAG=${ngf_tag} GINKGO_LABEL=telemetry working-directory: ./tests diff --git a/.goreleaser.yml
b/.goreleaser.yml index c629185b0..f0f9e335c 100644 --- a/.goreleaser.yml +++ b/.goreleaser.yml @@ -15,7 +15,13 @@ builds: asmflags: - all=-trimpath={{.Env.GOPATH}} ldflags: - - -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{.Date}} -X main.telemetryReportPeriod=24h -X main.telemetryEndpointInsecure=false + - -s -w + - -X main.version={{.Version}} + - -X main.commit={{.Commit}} + - -X main.date={{.Date}} + - -X main.telemetryReportPeriod=24h + - -X main.telemetryEndpoint={{.Env.TELEMETRY_ENDPOINT}} + - -X main.telemetryEndpointInsecure={{.Env.TELEMETRY_ENDPOINT_INSECURE}} main: ./cmd/gateway/ binary: gateway diff --git a/tests/Makefile b/tests/Makefile index b3da3d96c..2169d7f2a 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -13,6 +13,8 @@ GINKGO_LABEL= GINKGO_FLAGS= NGF_VERSION= CI=false +TELEMETRY_ENDPOINT= +TELEMETRY_ENDPOINT_INSECURE= ifneq ($(GINKGO_LABEL),) override GINKGO_FLAGS += -ginkgo.label-filter "$(GINKGO_LABEL)" @@ -38,11 +40,11 @@ delete-kind-cluster: ## Delete kind cluster .PHONY: build-images build-images: ## Build NGF and NGINX images - cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) build-images + cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) TELEMETRY_ENDPOINT=$(TELEMETRY_ENDPOINT) TELEMETRY_ENDPOINT_INSECURE=$(TELEMETRY_ENDPOINT_INSECURE) build-images .PHONY: build-images-with-plus build-images-with-plus: ## Build NGF and NGINX Plus images - cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) build-images-with-plus + cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) TELEMETRY_ENDPOINT=$(TELEMETRY_ENDPOINT) TELEMETRY_ENDPOINT_INSECURE=$(TELEMETRY_ENDPOINT_INSECURE) build-images-with-plus .PHONY: load-images load-images: ## Load NGF and NGINX images on configured kind cluster diff --git a/tests/README.md b/tests/README.md index 5cfb7e100..21179a5af 100644 --- a/tests/README.md +++ b/tests/README.md @@ -68,20 +68,23 @@ test-with-plus Runs the functional tests for NGF with NGINX Plus **Note:** The following variables are configurable when running the below `make` commands: -| Variable | Default | Description | -| ------------------- | ------------------------------- | -------------------------------------------------------------- | -| TAG | edge | tag for the locally built NGF images | -| PREFIX | nginx-gateway-fabric | prefix for the locally built NGF image | -| NGINX_PREFIX | nginx-gateway-fabric/nginx | prefix for the locally built NGINX image | -| NGINX_PLUS_PREFIX | nginx-gateway-fabric/nginx-plus | prefix for the locally built NGINX Plus image | -| PLUS_ENABLED | false | Flag to indicate if NGINX Plus should be enabled | -| PULL_POLICY | Never | NGF image pull policy | -| GW_API_VERSION | 1.0.0 | version of Gateway API resources to install | -| K8S_VERSION | latest | version of k8s that the tests are run on | -| GW_SERVICE_TYPE | NodePort | type of Service that should be created | -| GW_SVC_GKE_INTERNAL | false | specifies if the LoadBalancer should be a GKE internal service | -| GINKGO_LABEL | "" | name of the ginkgo label that will filter the tests to run | -| GINKGO_FLAGS | "" | other ginkgo flags to pass to the go test command | +| Variable | Default | Description | +|------------------------------|---------------------------------|---------------------------------------------------------------------| +| TAG | edge | tag for the locally built NGF images | +| PREFIX | nginx-gateway-fabric | prefix for the locally built NGF image | +| NGINX_PREFIX | nginx-gateway-fabric/nginx | prefix for the locally built NGINX image | +| NGINX_PLUS_PREFIX | 
nginx-gateway-fabric/nginx-plus | prefix for the locally built NGINX Plus image                        | +| PLUS_ENABLED                 | false                           | Flag to indicate if NGINX Plus should be enabled                     | +| PULL_POLICY                  | Never                           | NGF image pull policy                                                | +| GW_API_VERSION               | 1.0.0                           | version of Gateway API resources to install                          | +| K8S_VERSION                  | latest                          | version of k8s that the tests are run on                             | +| GW_SERVICE_TYPE              | NodePort                        | type of Service that should be created                               | +| GW_SVC_GKE_INTERNAL          | false                           | specifies if the LoadBalancer should be a GKE internal service       | +| GINKGO_LABEL                 | ""                              | name of the ginkgo label that will filter the tests to run           | +| GINKGO_FLAGS                 | ""                              | other ginkgo flags to pass to the go test command                    | +| TELEMETRY_ENDPOINT           | Set in the main Makefile        | The endpoint to which telemetry reports are sent                     | +| TELEMETRY_ENDPOINT_INSECURE  | Set in the main Makefile        | Controls whether TLS should be used when sending telemetry reports.  | + ## Step 1 - Create a Kubernetes cluster @@ -137,6 +140,12 @@ Or, to build NGF with NGINX Plus enabled (NGINX Plus cert and key must exist in make build-images-with-plus load-images-with-plus TAG=$(whoami) ``` +For the telemetry test, which requires an OTel collector, build the images with the following variables set: + +```makefile +TELEMETRY_ENDPOINT=otel-collector-opentelemetry-collector.collector.svc.cluster.local:4317 TELEMETRY_ENDPOINT_INSECURE=true +``` + ## Step 3 - Run the tests ### 3a - Run the functional tests locally @@ -151,6 +160,15 @@ Or, to run the tests with NGINX Plus enabled: make test TAG=$(whoami) PLUS_ENABLED=true ``` +> The command above doesn't run the telemetry functional test, which requires a dedicated invocation because it uses a +> specially built image (see above) and it needs to deploy NGF differently from the rest of the functional tests. + +To run the telemetry test: + +```makefile +make test TAG=$(whoami) GINKGO_LABEL=telemetry +``` + ### 3b - Run the tests on a GKE cluster from a GCP VM This step only applies if you are running the NFR tests, or would like to run the functional tests on a GKE cluster from a GCP based VM. diff --git a/tests/framework/resourcemanager.go b/tests/framework/resourcemanager.go index 72c3992cb..e62cb5db8 100644 --- a/tests/framework/resourcemanager.go +++ b/tests/framework/resourcemanager.go @@ -30,6 +30,7 @@ import ( "strings" "time" + apps "k8s.io/api/apps/v1" core "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -37,20 +38,24 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" v1 "sigs.k8s.io/gateway-api/apis/v1" ) // ResourceManager handles creating/updating/deleting Kubernetes resources.
type ResourceManager struct { - K8sClient client.Client - FS embed.FS - TimeoutConfig TimeoutConfig + K8sClient client.Client + ClientGoClient kubernetes.Interface // used when k8sClient is not enough + FS embed.FS + TimeoutConfig TimeoutConfig } // ClusterInfo holds the cluster metadata type ClusterInfo struct { - K8sVersion string + K8sVersion string + // ID is the UID of kube-system namespace + ID string MemoryPerNode string GkeInstanceType string GkeZone string @@ -406,9 +411,89 @@ func (rm *ResourceManager) GetClusterInfo() (ClusterInfo, error) { ci.GkeZone = node.Labels["topology.kubernetes.io/zone"] } + var ns core.Namespace + key := types.NamespacedName{Name: "kube-system"} + + if err := rm.K8sClient.Get(ctx, key, &ns); err != nil { + return *ci, fmt.Errorf("error getting kube-system namespace: %w", err) + } + + ci.ID = string(ns.UID) + return *ci, nil } +// GetPodNames returns the names of all Pods in the specified namespace that match the given labels. +func (rm *ResourceManager) GetPodNames(namespace string, labels client.MatchingLabels) ([]string, error) { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) + defer cancel() + + var podList core.PodList + if err := rm.K8sClient.List( + ctx, + &podList, + client.InNamespace(namespace), + labels, + ); err != nil { + return nil, fmt.Errorf("error getting list of Pods: %w", err) + } + + names := make([]string, 0, len(podList.Items)) + + for _, pod := range podList.Items { + names = append(names, pod.Name) + } + + return names, nil +} + +// GetPodLogs returns the logs from the specified Pod +func (rm *ResourceManager) GetPodLogs(namespace, name string, opts *core.PodLogOptions) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) + defer cancel() + + req := rm.ClientGoClient.CoreV1().Pods(namespace).GetLogs(name, opts) + + logs, err := req.Stream(ctx) + if err != nil { + return "", fmt.Errorf("error getting logs from Pod: %w", err) + } + defer logs.Close() + + buf := new(bytes.Buffer) + if _, err := buf.ReadFrom(logs); err != nil { + return "", fmt.Errorf("error reading logs from Pod: %w", err) + } + + return buf.String(), nil +} + +// GetNGFDeployment returns the NGF Deployment in the specified namespace with the given release name. +func (rm *ResourceManager) GetNGFDeployment(namespace, releaseName string) (*apps.Deployment, error) { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) + defer cancel() + + var deployments apps.DeploymentList + + if err := rm.K8sClient.List( + ctx, + &deployments, + client.InNamespace(namespace), + client.MatchingLabels{ + "app.kubernetes.io/instance": releaseName, + }, + ); err != nil { + return nil, fmt.Errorf("error getting list of Deployments: %w", err) + } + + if len(deployments.Items) != 1 { + return nil, fmt.Errorf("expected 1 NGF Deployment, got %d", len(deployments.Items)) + } + + deployment := deployments.Items[0] + return &deployment, nil +} + // GetReadyNGFPodNames returns the name(s) of the NGF Pod(s). 
func GetReadyNGFPodNames( k8sClient client.Client, diff --git a/tests/suite/manifests/telemetry/collector-values.yaml b/tests/suite/manifests/telemetry/collector-values.yaml new file mode 100644 index 000000000..5c2f08d95 --- /dev/null +++ b/tests/suite/manifests/telemetry/collector-values.yaml @@ -0,0 +1,31 @@ +mode: deployment +replicaCount: 1 +config: + exporters: + debug: + verbosity: detailed + logging: {} + extensions: + health_check: {} + memory_ballast: + size_in_percentage: 40 + processors: + batch: {} + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + service: + extensions: + - health_check + pipelines: + traces: + exporters: + - debug + receivers: + - otlp diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index f758bd98b..e1451fb97 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -21,6 +21,7 @@ import ( k8sRuntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" ctlr "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -103,11 +104,15 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { k8sClient, err = client.New(k8sConfig, options) Expect(err).ToNot(HaveOccurred()) + clientGoClient, err := kubernetes.NewForConfig(k8sConfig) + Expect(err).ToNot(HaveOccurred()) + timeoutConfig = framework.DefaultTimeoutConfig() resourceManager = framework.ResourceManager{ - K8sClient: k8sClient, - FS: manifests, - TimeoutConfig: timeoutConfig, + K8sClient: k8sClient, + ClientGoClient: clientGoClient, + FS: manifests, + TimeoutConfig: timeoutConfig, } clusterInfo, err = resourceManager.GetClusterInfo() @@ -210,18 +215,22 @@ func teardown(relName string) { )).To(Succeed()) } -var _ = BeforeSuite(func() { +func getDefaultSetupCfg() setupConfig { _, file, _, _ := runtime.Caller(0) fileDir := path.Join(path.Dir(file), "../") basepath := filepath.Dir(fileDir) localChartPath = filepath.Join(basepath, "deploy/helm-chart") - cfg := setupConfig{ + return setupConfig{ releaseName: releaseName, chartPath: localChartPath, gwAPIVersion: *gatewayAPIVersion, deploy: true, } +} + +var _ = BeforeSuite(func() { + cfg := getDefaultSetupCfg() labelFilter := GinkgoLabelFilter() cfg.nfr = isNFR(labelFilter) @@ -229,7 +238,10 @@ var _ = BeforeSuite(func() { // Skip deployment if: // - running upgrade test (this test will deploy its own version) // - running longevity teardown (deployment will already exist) - if strings.Contains(labelFilter, "upgrade") || strings.Contains(labelFilter, "longevity-teardown") { + // - running telemetry test (NGF will be deployed as part of the test) + if strings.Contains(labelFilter, "upgrade") || + strings.Contains(labelFilter, "longevity-teardown") || + strings.Contains(labelFilter, "telemetry") { cfg.deploy = false } diff --git a/tests/suite/telemetry_test.go b/tests/suite/telemetry_test.go new file mode 100644 index 000000000..6c62d3148 --- /dev/null +++ b/tests/suite/telemetry_test.go @@ -0,0 +1,166 @@ +package suite + +import ( + "fmt" + "os/exec" + "strings" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + core "k8s.io/api/core/v1" + crClient "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + collectorNamespace = "collector" + collectorChartReleaseName = "otel-collector" + // FIXME(pleshakov): Find a automated way to keep the version updated here similar to dependabot. + // https://github.com/nginxinc/nginx-gateway-fabric/issues/1665 + collectorChartVersion = "0.73.1" +) + +var _ = Describe("Telemetry test with OTel collector", Label("telemetry"), func() { + BeforeEach(func() { + // Because NGF reports telemetry on start, we need to install the collector first. + + // Install collector + output, err := installCollector() + Expect(err).ToNot(HaveOccurred(), string(output)) + + // Install NGF + // Note: the BeforeSuite call doesn't install NGF for 'telemetry' label + + setup(getDefaultSetupCfg()) + }) + + AfterEach(func() { + output, err := uninstallCollector() + Expect(err).ToNot(HaveOccurred(), string(output)) + }) + + It("sends telemetry", func() { + names, err := resourceManager.GetPodNames( + collectorNamespace, + crClient.MatchingLabels{ + "app.kubernetes.io/name": "opentelemetry-collector", + }, + ) + + Expect(err).ToNot(HaveOccurred()) + Expect(names).To(HaveLen(1)) + + name := names[0] + + // We assert that all data points were sent + // For some data points, as a sanity check, we assert on sent values. + + info, err := resourceManager.GetClusterInfo() + Expect(err).ToNot(HaveOccurred()) + + ngfDeployment, err := resourceManager.GetNGFDeployment(ngfNamespace, releaseName) + Expect(err).ToNot(HaveOccurred()) + + matchFirstExpectedLine := func() bool { + logs, err := resourceManager.GetPodLogs(collectorNamespace, name, &core.PodLogOptions{}) + Expect(err).ToNot(HaveOccurred()) + return strings.Contains(logs, "dataType: Str(ngf-product-telemetry)") + } + + // Wait until the collector has received the telemetry data + Eventually(matchFirstExpectedLine, "30s", "5s").Should(BeTrue()) + + logs, err := resourceManager.GetPodLogs(collectorNamespace, name, &core.PodLogOptions{}) + Expect(err).ToNot(HaveOccurred()) + + assertConsecutiveLinesInLogs( + logs, + []string{ + "ImageSource:", + "ProjectName: Str(NGF)", + "ProjectVersion:", + "ProjectArchitecture:", + fmt.Sprintf("ClusterID: Str(%s)", info.ID), + "ClusterVersion:", + "ClusterPlatform:", + fmt.Sprintf("InstallationID: Str(%s)", ngfDeployment.UID), + fmt.Sprintf("ClusterNodeCount: Int(%d)", info.NodeCount), + "FlagNames: Slice", + "FlagValues: Slice", + "GatewayCount: Int(0)", + "GatewayClassCount: Int(1)", + "HTTPRouteCount: Int(0)", + "SecretCount: Int(0)", + "ServiceCount: Int(0)", + "EndpointCount: Int(0)", + "NGFReplicaCount: Int(1)", + }, + ) + }) +}) + +func installCollector() ([]byte, error) { + repoAddArgs := []string{ + "repo", + "add", + "open-telemetry", + "https://open-telemetry.github.io/opentelemetry-helm-charts", + } + + if output, err := exec.Command("helm", repoAddArgs...).CombinedOutput(); err != nil { + return output, err + } + + args := []string{ + "install", + collectorChartReleaseName, + "open-telemetry/opentelemetry-collector", + "--create-namespace", + "--namespace", collectorNamespace, + "--version", collectorChartVersion, + "-f", "manifests/telemetry/collector-values.yaml", + "--wait", + } + + return exec.Command("helm", args...).CombinedOutput() +} + +func uninstallCollector() ([]byte, error) { + args := []string{ + "uninstall", collectorChartReleaseName, + "--namespace", collectorNamespace, + } + + return exec.Command("helm", args...).CombinedOutput() +} + +func 
assertConsecutiveLinesInLogs(logs string, expectedLines []string) { + lines := strings.Split(logs, "\n") + + // Find the first expected line in the log lines + + i := 0 + + for ; i < len(lines); i++ { + if strings.Contains(lines[i], expectedLines[0]) { + i++ + break + } + } + + if i == len(lines) { + Fail(fmt.Sprintf("Expected first line not found: %s, \n%s", expectedLines[0], logs)) + } + + linesLeft := len(lines) - i + expectedLinesLeft := len(expectedLines) - 1 + + if linesLeft < expectedLinesLeft { + format := "Not enough lines remain in the logs, expected %d, got %d\n%s" + Fail(fmt.Sprintf(format, expectedLinesLeft, linesLeft, logs)) + } + + for j := 1; j < len(expectedLines); j++ { + Expect(lines[i]).To(ContainSubstring(expectedLines[j])) + i++ + } +}
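A note on the log-matching helper at the end of telemetry_test.go: the test relies on the collector's debug exporter printing the reported attributes on consecutive lines, so it locates the line containing the first expected substring and then requires each remaining substring to appear on the immediately following lines, in order. Below is a standalone sketch of that matching idea with made-up log text; it is illustrative plain Go, not code from the patch, and omits the Ginkgo/Gomega assertions.

```go
package main

import (
	"fmt"
	"strings"
)

// containsConsecutive mirrors the idea behind assertConsecutiveLinesInLogs:
// find the first log line containing expected[0], then require each remaining
// expected substring to appear on the immediately following lines, in order.
func containsConsecutive(logs string, expected []string) bool {
	lines := strings.Split(logs, "\n")

	// Locate the line that matches the first expected substring.
	start := -1
	for i, line := range lines {
		if strings.Contains(line, expected[0]) {
			start = i + 1
			break
		}
	}
	if start == -1 || len(lines)-start < len(expected)-1 {
		return false
	}

	// Every remaining substring must match the next line, with no gaps.
	for j := 1; j < len(expected); j++ {
		if !strings.Contains(lines[start+j-1], expected[j]) {
			return false
		}
	}
	return true
}

func main() {
	logs := "ProjectName: Str(NGF)\nProjectVersion: Str(edge)\nClusterNodeCount: Int(1)"

	fmt.Println(containsConsecutive(logs, []string{"ProjectName:", "ProjectVersion:", "ClusterNodeCount:"})) // true
	fmt.Println(containsConsecutive(logs, []string{"ProjectName:", "ClusterNodeCount:"}))                    // false: not consecutive
}
```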
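On the `.goreleaser.yml` change: the `-X` linker flags overwrite package-level string variables in `package main` at build time, which is how the workflows select a telemetry endpoint per environment (empty in ci.yml and conformance.yml to disable reporting, the in-cluster collector address in functional.yml). A minimal sketch of that mechanism follows; the variable names match the ldflags in the patch, but the handling shown is illustrative, not NGF's actual startup code.

```go
package main

import "log"

// Overridden at link time, for example:
//   go build -ldflags "-X main.telemetryEndpoint=collector:4317 -X main.telemetryEndpointInsecure=true"
// With no override, telemetryEndpoint stays empty and reporting is skipped.
var (
	telemetryEndpoint         string
	telemetryEndpointInsecure string
)

func main() {
	if telemetryEndpoint == "" {
		log.Println("telemetry reporting disabled: no endpoint configured at build time")
		return
	}

	insecure := telemetryEndpointInsecure == "true"
	log.Printf("reporting telemetry to %s (insecure: %v)", telemetryEndpoint, insecure)
}
```

Locally, the same variables flow through the tests Makefile, so per the README changes above a telemetry-test run amounts to `make build-images load-images TAG=$(whoami) TELEMETRY_ENDPOINT=otel-collector-opentelemetry-collector.collector.svc.cluster.local:4317 TELEMETRY_ENDPOINT_INSECURE=true` followed by `make test TAG=$(whoami) GINKGO_LABEL=telemetry`.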