From 39f54e90d93f80c926c1a89542a2c006be090c8e Mon Sep 17 00:00:00 2001
From: Kashif Khan <kashif.khan@est.tech>
Date: Thu, 12 Dec 2024 10:15:48 +0200
Subject: [PATCH] Add e2e test for metrics service

Signed-off-by: Kashif Khan <kashif.khan@est.tech>
---
 .golangci.yaml             |   1 +
 config/base/manager.yaml   |  96 +++++++++++++++--------------
 config/render/capm3.yaml   |   3 +
 main.go                    |   4 +-
 test/e2e/e2e_suite_test.go | 123 +++++++++++++++++++++++++++++++++++++
 5 files changed, 180 insertions(+), 47 deletions(-)

diff --git a/.golangci.yaml b/.golangci.yaml
index ec0300a8b0..87266094c9 100644
--- a/.golangci.yaml
+++ b/.golangci.yaml
@@ -120,6 +120,7 @@ issues:
     linters:
     - gci
     - goconst
+    - gosec
   - path: _test\.go
     linters:
     - errcheck
diff --git a/config/base/manager.yaml b/config/base/manager.yaml
index d4a1af4447..22f7fc975f 100644
--- a/config/base/manager.yaml
+++ b/config/base/manager.yaml
@@ -19,52 +19,56 @@ spec:
         webhook: metal3-io-v1alpha1-baremetalhost
     spec:
       containers:
-        - command:
-            - /baremetal-operator
-          args:
-            - --enable-leader-election
-            - --tls-min-version=TLS13
-          image: quay.io/metal3-io/baremetal-operator
-          imagePullPolicy: Always
-          env:
-            - name: POD_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.name
-            - name: POD_NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-          envFrom:
-            - configMapRef:
-                name: ironic
-          name: manager
-          securityContext:
-            allowPrivilegeEscalation: false
-            capabilities:
-              drop:
-                - ALL
-            privileged: false
-            runAsUser: 65532
-            runAsGroup: 65532
-          livenessProbe:
-            httpGet:
-              path: /healthz
-              port: 9440
-            initialDelaySeconds: 10
-            periodSeconds: 10
-            timeoutSeconds: 2
-            successThreshold: 1
-            failureThreshold: 10
-          readinessProbe:
-            httpGet:
-              path: /readyz
-              port: 9440
-            initialDelaySeconds: 10
-            periodSeconds: 10
-            timeoutSeconds: 2
-            successThreshold: 1
-            failureThreshold: 10
+      - command:
+        - /baremetal-operator
+        args:
+        - --enable-leader-election
+        - --tls-min-version=TLS13
+        ports:
+        - containerPort: 8443
+          protocol: TCP
+          name: https
+        image: quay.io/metal3-io/baremetal-operator
+        imagePullPolicy: Always
+        env:
+        - name: POD_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.name
+        - name: POD_NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+        envFrom:
+        - configMapRef:
+            name: ironic
+        name: manager
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          privileged: false
+          runAsUser: 65532
+          runAsGroup: 65532
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 9440
+          initialDelaySeconds: 10
+          periodSeconds: 10
+          timeoutSeconds: 2
+          successThreshold: 1
+          failureThreshold: 10
+        readinessProbe:
+          httpGet:
+            path: /readyz
+            port: 9440
+          initialDelaySeconds: 10
+          periodSeconds: 10
+          timeoutSeconds: 2
+          successThreshold: 1
+          failureThreshold: 10
       terminationGracePeriodSeconds: 10
       securityContext:
         runAsNonRoot: true
diff --git a/config/render/capm3.yaml b/config/render/capm3.yaml
index 683a850280..d857a6a003 100644
--- a/config/render/capm3.yaml
+++ b/config/render/capm3.yaml
@@ -2598,6 +2598,9 @@ spec:
         - containerPort: 9443
           name: webhook-server
           protocol: TCP
+        - containerPort: 8443
+          name: https
+          protocol: TCP
         readinessProbe:
           failureThreshold: 10
           httpGet:
diff --git a/main.go b/main.go
index 7186af8d83..69c6043ace 100644
--- a/main.go
+++ b/main.go
@@ -137,7 +137,7 @@ func main() {
 	// namespace.
 	flag.StringVar(&watchNamespace, "namespace", os.Getenv("WATCH_NAMESPACE"),
 		"Namespace that the controller watches to reconcile host resources.")
-	flag.StringVar(&metricsBindAddr, "metrics-addr", "127.0.0.1:8085",
+	flag.StringVar(&metricsBindAddr, "metrics-addr", ":8443",
 		"The address the metric endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
 		"Enable leader election for controller manager. "+
@@ -217,7 +217,9 @@ func main() {
 		Scheme: scheme,
 		Metrics: metricsserver.Options{
 			BindAddress:    metricsBindAddr,
+			SecureServing:  true,
 			FilterProvider: filters.WithAuthenticationAndAuthorization,
+			TLSOpts:        tlsOptionOverrides,
 		},
 		WebhookServer: webhook.NewServer(webhook.Options{
 			Port:    webhookPort,
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
index bad02f6021..805b76145f 100644
--- a/test/e2e/e2e_suite_test.go
+++ b/test/e2e/e2e_suite_test.go
@@ -5,11 +5,15 @@ package e2e
 
 import (
 	"context"
+	"encoding/json"
 	"flag"
+	"fmt"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"strings"
 	"testing"
+	"time"
 
 	metal3api "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1"
 	. "github.com/onsi/ginkgo/v2"
@@ -79,6 +83,11 @@ func TestE2e(t *testing.T) {
 	RunSpecs(t, "E2e Suite")
 }
 
+const namespace = "baremetal-operator-system"                                      // where SA, service, curl pod live
+const serviceAccountName = "baremetal-operator-controller-manager"                 // account the token is issued for
+const metricsServiceName = "baremetal-operator-controller-manager-metrics-service" // Service serving /metrics on 8443
+const metricsRoleBindingName = "baremetal-operator-metrics-binding"                // binding created in suite setup
+
 var _ = SynchronizedBeforeSuite(func() []byte {
 	var kubeconfigPath string
 
@@ -161,6 +170,62 @@ var _ = SynchronizedBeforeSuite(func() []byte {
 		Expect(err).NotTo(HaveOccurred())
 	}
 
+	// Metrics test start
+	By("creating a ClusterRoleBinding for the service account to allow access to metrics")
+	cmd := exec.Command("kubectl", "create", "clusterrolebinding", metricsRoleBindingName,
+		"--clusterrole=baremetal-operator-metrics-reader",
+		fmt.Sprintf("--serviceaccount=%s:%s", namespace, serviceAccountName),
+	)
+	_, err := cmd.CombinedOutput()
+	Expect(err).NotTo(HaveOccurred(), "Failed to create ClusterRoleBinding")
+
+	By("validating that the metrics service is available")
+	cmd = exec.Command("kubectl", "get", "service", metricsServiceName, "-n", namespace)
+	_, err = cmd.CombinedOutput()
+	Expect(err).NotTo(HaveOccurred(), "Metrics service should exist")
+
+	By("getting the service account token")
+	token, err := serviceAccountToken()
+	Expect(err).NotTo(HaveOccurred())
+	Expect(token).NotTo(BeEmpty())
+
+	By("waiting for the metrics endpoint to be ready")
+	verifyMetricsEndpointReady := func(g Gomega) {
+		cmd := exec.Command("kubectl", "get", "endpoints", metricsServiceName, "-n", namespace)
+		output, err := cmd.CombinedOutput()
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(output).To(ContainSubstring("8443"), "Metrics endpoint is not ready")
+	}
+	Eventually(verifyMetricsEndpointReady).Should(Succeed())
+
+	By("creating the curl-metrics pod to access the metrics endpoint")
+	cmd = exec.Command("kubectl", "run", "curl-metrics", "--restart=Never",
+		"--namespace", namespace,
+		"--image=curlimages/curl:7.87.0",
+		"--command",
+		"--", "curl", "-v", "--tlsv1.3", "-k", "-H", fmt.Sprintf("Authorization:Bearer %s", token),
+		fmt.Sprintf("https://%s.%s.svc.cluster.local:8443/metrics", metricsServiceName, namespace))
+	_, err = cmd.CombinedOutput()
+	Expect(err).NotTo(HaveOccurred(), "Failed to create curl-metrics pod")
+
+	By("waiting for the curl-metrics pod to complete.")
+	verifyCurlUp := func(g Gomega) {
+		cmd := exec.Command("kubectl", "get", "pods", "curl-metrics",
+			"-o", "jsonpath={.status.phase}",
+			"-n", namespace)
+		output, err := cmd.CombinedOutput()
+		g.Expect(err).NotTo(HaveOccurred())
+		g.Expect(string(output)).To(Equal("Succeeded"), "curl pod in wrong status")
+	}
+	Eventually(verifyCurlUp, 5*time.Minute).Should(Succeed())
+
+	By("getting the metrics by checking curl-metrics logs")
+	metricsOutput := getMetricsOutput()
+	Expect(metricsOutput).To(ContainSubstring(
+		"controller_runtime_reconcile_total",
+	))
+	// Metrics test end
+
 	return []byte(strings.Join([]string{clusterProxy.GetKubeconfigPath()}, ","))
 }, func(data []byte) {
 	// Before each parallel node
@@ -179,6 +244,64 @@ var _ = SynchronizedBeforeSuite(func() []byte {
 	clusterProxy = framework.NewClusterProxy("bmo-e2e", kubeconfigPath, scheme)
 })
 
+// serviceAccountToken returns a token for serviceAccountName in namespace (the package constants).
+// It writes a TokenRequest manifest to a temporary file, posts it to the service account's
+// token subresource via `kubectl create --raw`, and parses the token from the API response.
+func serviceAccountToken() (string, error) {
+	const tokenRequestRawString = `{
+		"apiVersion": "authentication.k8s.io/v1",
+		"kind": "TokenRequest"
+	}`
+
+	// Temporary file to store the token request; it is removed when this function returns.
+	secretName := fmt.Sprintf("%s-token-request", serviceAccountName)
+	tokenRequestFile := filepath.Join("/tmp", secretName) //nolint: gocritic
+	err := os.WriteFile(tokenRequestFile, []byte(tokenRequestRawString), os.FileMode(0o644))
+	if err != nil {
+		return "", err
+	}
+	defer os.Remove(tokenRequestFile) // best-effort cleanup, a leftover file is harmless
+
+	var out string
+	verifyTokenCreation := func(g Gomega) {
+		// Execute kubectl to create the token; read stdout only so stderr noise cannot corrupt the JSON
+		cmd := exec.Command("kubectl", "create", "--raw", fmt.Sprintf(
+			"/api/v1/namespaces/%s/serviceaccounts/%s/token",
+			namespace,
+			serviceAccountName,
+		), "-f", tokenRequestFile)
+		output, err := cmd.Output()
+		g.Expect(err).NotTo(HaveOccurred())
+
+		// Parse the JSON output to extract the token
+		var token tokenRequest
+		err = json.Unmarshal(output, &token)
+		g.Expect(err).NotTo(HaveOccurred())
+
+		out = token.Status.Token
+	}
+	Eventually(verifyTokenCreation).Should(Succeed())
+
+	return out, nil
+}
+
+// tokenRequest is a minimal view of the JSON returned by the Kubernetes TokenRequest API;
+// only status.token is decoded, every other field of the response is ignored.
+type tokenRequest struct {
+	Status struct {
+		Token string `json:"token"` // bearer token sent in the Authorization header
+	} `json:"status"`
+}
+
+// getMetricsOutput fetches the logs of the curl-metrics pod, which was started to query the
+// metrics endpoint, and returns them as a string. A kubectl error fails the running spec.
+func getMetricsOutput() string {
+	By("getting the curl-metrics logs")
+	logs, err := exec.Command("kubectl", "logs", "curl-metrics", "-n", namespace).CombinedOutput()
+	Expect(err).NotTo(HaveOccurred(), "Failed to retrieve logs from curl pod")
+	return string(logs)
+}
+
 // Using a SynchronizedAfterSuite for controlling how to delete resources shared across ParallelNodes (~ginkgo threads).
 // The kubernetes cluster is shared across all the tests, so it should be deleted only after all ParallelNodes completes.
 // The artifact folder is preserved.