diff --git a/tests/Makefile b/tests/Makefile index 2eb9203426..947fe6812e 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -107,7 +107,7 @@ test: ## Runs the functional tests on your default k8s cluster --label-filter "functional" $(GINKGO_FLAGS) ./suite -- \ --gateway-api-version=$(GW_API_VERSION) --gateway-api-prev-version=$(GW_API_PREV_VERSION) \ --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ - --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) \ + --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \ --pull-policy=$(PULL_POLICY) --k8s-version=$(K8S_VERSION) --service-type=$(GW_SERVICE_TYPE) \ --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) diff --git a/tests/scale/generate_manifests.go b/tests/framework/generate_manifests.go similarity index 66% rename from tests/scale/generate_manifests.go rename to tests/framework/generate_manifests.go index 88f752141d..46c7b92b32 100644 --- a/tests/scale/generate_manifests.go +++ b/tests/framework/generate_manifests.go @@ -1,17 +1,18 @@ -//go:build scale -// +build scale - -package scale +package framework import ( "bytes" + "errors" "fmt" - "os" - "path/filepath" + "io" "text/template" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/yaml" + "sigs.k8s.io/controller-runtime/pkg/client" ) -var gwTmplTxt = `apiVersion: gateway.networking.k8s.io/v1 +const gwTmplTxt = `apiVersion: gateway.networking.k8s.io/v1 kind: Gateway metadata: name: gateway @@ -33,7 +34,7 @@ spec: {{- end -}} {{- end -}}` -var hrTmplTxt = `apiVersion: gateway.networking.k8s.io/v1 +const hrTmplTxt = `apiVersion: gateway.networking.k8s.io/v1 kind: HTTPRoute metadata: name: {{ .Name }} @@ -53,7 +54,7 @@ spec: port: 80` // nolint:all -var secretTmplTxt = `apiVersion: v1 +const secretTmplTxt = `apiVersion: v1 kind: Secret metadata: name: {{ . 
}} @@ -63,8 +64,7 @@ data: tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2UUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktjd2dnU2pBZ0VBQW9JQkFRQzZtTnJSdUZ2WXZoSE4KbXI3c1FvNUtKSUVDN3N6TFVrNExFeklSNS9yMEVaUjQ2RnRTaGJQd0ZuaXAwMFBxekhpVkhKYy92TjdkQTVLeApQS1VmdFJuQ1J6YldVaTZBZzJpRU93bXF6WUhGbVNpZkFlVjk0RlAxOGtSbjl1ckV3OEpiRXJIUncrVW51L25tCmFMRHF1eGpFTVBweGhuRklCSnYwK1R3djNEVGx6TjNwUlV6dnpidGZvZCtEVTZBSmR6N3Rid1dTNmR6MHc1Z2kKbW9RelZnbFpnVDBJek9FZkV3NVpWMnRMZllHZWRlRVJ1VjhtR041c09va3R2aGxsMU1udHRaMkZNVHgySmVjUQo3K0xBRm9YVnBTS2NjbUFVZ1JBM0xOOHdVZXBVTHZZdFhiUm1QTFc4SjFINmhFeHJHTHBiTERZNmpzbGxBNlZpCk0xMjVjU0hsQWdNQkFBRUNnZ0VBQnpaRE50bmVTdWxGdk9HZlFYaHRFWGFKdWZoSzJBenRVVVpEcUNlRUxvekQKWlV6dHdxbkNRNlJLczUyandWNTN4cU9kUU94bTNMbjNvSHdNa2NZcEliWW82MjJ2dUczYnkwaVEzaFlsVHVMVgpqQmZCcS9UUXFlL2NMdngvSkczQWhFNmJxdFRjZFlXeGFmTmY2eUtpR1dzZk11WVVXTWs4MGVJVUxuRmZaZ1pOCklYNTlSOHlqdE9CVm9Sa3hjYTVoMW1ZTDFsSlJNM3ZqVHNHTHFybmpOTjNBdWZ3ZGRpK1VDbGZVL2l0K1EvZkUKV216aFFoTlRpNVFkRWJLVStOTnYvNnYvb2JvandNb25HVVBCdEFTUE05cmxFemIralQ1WHdWQjgvLzRGY3VoSwoyVzNpcjhtNHVlQ1JHSVlrbGxlLzhuQmZ0eVhiVkNocVRyZFBlaGlPM1FLQmdRRGlrR3JTOTc3cjg3Y1JPOCtQClpoeXltNXo4NVIzTHVVbFNTazJiOTI1QlhvakpZL2RRZDVTdFVsSWE4OUZKZnNWc1JRcEhHaTFCYzBMaTY1YjIKazR0cE5xcVFoUmZ1UVh0UG9GYXRuQzlPRnJVTXJXbDVJN0ZFejZnNkNQMVBXMEg5d2hPemFKZUdpZVpNYjlYTQoybDdSSFZOcC9jTDlYbmhNMnN0Q1lua2Iwd0tCZ1FEUzF4K0crakEyUVNtRVFWNXA1RnRONGcyamsyZEFjMEhNClRIQ2tTazFDRjhkR0Z2UWtsWm5ZbUt0dXFYeXNtekJGcnZKdmt2eUhqbUNYYTducXlpajBEdDZtODViN3BGcVAKQWxtajdtbXI3Z1pUeG1ZMXBhRWFLMXY4SDNINGtRNVl3MWdrTWRybVJHcVAvaTBGaDVpaGtSZS9DOUtGTFVkSQpDcnJjTzhkUVp3S0JnSHA1MzRXVWNCMVZibzFlYStIMUxXWlFRUmxsTWlwRFM2TzBqeWZWSmtFb1BZSEJESnp2ClIrdzZLREJ4eFoyWmJsZ05LblV0YlhHSVFZd3lGelhNcFB5SGxNVHpiZkJhYmJLcDFyR2JVT2RCMXpXM09PRkgKcmppb21TUm1YNmxhaDk0SjRHU0lFZ0drNGw1SHhxZ3JGRDZ2UDd4NGRjUktJWFpLZ0w2dVJSSUpBb0dCQU1CVApaL2p5WStRNTBLdEtEZHUrYU9ORW4zaGxUN3hrNXRKN3NBek5rbWdGMU10RXlQUk9Xd1pQVGFJbWpRbk9qbHdpCldCZ2JGcXg0M2ZlQ1Z4ZXJ6V3ZEM0txaWJVbWpCTkNMTGtYeGh3ZEVteFQwVit2NzZGYzgwaTNNYVdSNnZZR08KditwVVovL0F6UXdJcWZ6dlVmV2ZxdStrMHlhVXhQOGNlcFBIRyt0bEFvR0FmQUtVVWhqeFU0Ym5vVzVwVUhKegpwWWZXZXZ5TW54NWZyT2VsSmRmNzlvNGMvMHhVSjh1eFBFWDFkRmNrZW96dHNpaVFTNkN6MENRY09XVWxtSkRwCnVrdERvVzM3VmNSQU1BVjY3NlgxQVZlM0UwNm5aL2g2Tkd4Z28rT042Q3pwL0lkMkJPUm9IMFAxa2RjY1NLT3kKMUtFZlNnb1B0c1N1eEpBZXdUZmxDMXc9Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K ` -var appTmplTxt = `apiVersion: v1 -apiVersion: apps/v1 +const appTmplTxt = `apiVersion: apps/v1 kind: Deployment metadata: name: {{ . }} @@ -105,25 +105,55 @@ var ( appTmpl = template.Must(template.New("app").Parse(appTmplTxt)) ) -type Listener struct { +type listener struct { Name string HostnamePrefix string SecretName string } -type Route struct { +type route struct { Name string ListenerName string HostnamePrefix string BackendName string } -func getPrereqDirName(manifestDir string) string { - return filepath.Join(manifestDir, "prereqs") +// ScaleObjects contains objects for scale testing. +type ScaleObjects struct { + // BaseObjects contains objects that are common to all scale iterations. + BaseObjects []client.Object + // ScaleIterationGroups contains objects for each scale iteration. 
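+ // Each group holds the objects created by one scale iteration, such as an updated Gateway and its new HTTPRoute.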
+ ScaleIterationGroups [][]client.Object } -func generateScaleListenerManifests(numListeners int, manifestDir string, tls bool) error { - listeners := make([]Listener, 0) +func decodeObjects(reader io.Reader) ([]client.Object, error) { + var objects []client.Object + + decoder := yaml.NewYAMLOrJSONDecoder(reader, 4096) + for { + obj := unstructured.Unstructured{} + if err := decoder.Decode(&obj); err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, fmt.Errorf("error decoding resource: %w", err) + } + + if len(obj.Object) == 0 { + continue + } + + objects = append(objects, &obj) + } + + return objects, nil +} + +// GenerateScaleListenerObjects generates objects for a given number of listeners for the scale test. +func GenerateScaleListenerObjects(numListeners int, tls bool) (ScaleObjects, error) { + var result ScaleObjects + + listeners := make([]listener, 0) backends := make([]string, 0) secrets := make([]string, 0) @@ -138,13 +168,13 @@ func generateScaleListenerManifests(numListeners int, manifestDir string, tls bo secrets = append(secrets, secretName) } - listeners = append(listeners, Listener{ + listeners = append(listeners, listener{ Name: listenerName, HostnamePrefix: hostnamePrefix, SecretName: secretName, }) - route := Route{ + r := route{ Name: fmt.Sprintf("route-%d", i), ListenerName: listenerName, HostnamePrefix: hostnamePrefix, @@ -153,44 +183,57 @@ func generateScaleListenerManifests(numListeners int, manifestDir string, tls bo backends = append(backends, backendName) - if err := generateManifests(manifestDir, i, listeners, []Route{route}); err != nil { - return err + objects, err := generateManifests(listeners, []route{r}) + if err != nil { + return ScaleObjects{}, err } + + result.ScaleIterationGroups = append(result.ScaleIterationGroups, objects) } - if err := generateSecrets(getPrereqDirName(manifestDir), secrets); err != nil { - return err + secretObjects, err := generateSecrets(secrets) + if err != nil { + return ScaleObjects{}, err } - return generateBackendAppManifests(getPrereqDirName(manifestDir), backends) -} + result.BaseObjects = append(result.BaseObjects, secretObjects...) -func generateSecrets(secretsDir string, secrets []string) error { - err := os.Mkdir(secretsDir, 0o750) - if err != nil && !os.IsExist(err) { - return err + backendObjects, err := generateBackendAppObjects(backends) + if err != nil { + return ScaleObjects{}, err } + result.BaseObjects = append(result.BaseObjects, backendObjects...) + + return result, nil +} + +func generateSecrets(secrets []string) ([]client.Object, error) { + objects := make([]client.Object, 0, len(secrets)) + for _, secret := range secrets { var buf bytes.Buffer - if err = secretTmpl.Execute(&buf, secret); err != nil { - return err + if err := secretTmpl.Execute(&buf, secret); err != nil { + return nil, err } - path := filepath.Join(secretsDir, fmt.Sprintf("%s.yaml", secret)) - - fmt.Println("Writing", path) - if err := os.WriteFile(path, buf.Bytes(), 0o600); err != nil { - return err + objs, err := decodeObjects(&buf) + if err != nil { + return nil, err } + + objects = append(objects, objs...) } - return nil + return objects, nil } -func generateScaleHTTPRouteManifests(numRoutes int, manifestDir string) error { - l := Listener{ +// GenerateScaleHTTPRouteObjects generates objects for a given number of routes for the scale test. 
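+// The Gateway is generated only as part of the first iteration group; all routes attach to its single listener.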
+func GenerateScaleHTTPRouteObjects(numRoutes int) (ScaleObjects, error) { + var result ScaleObjects + + l := listener{ Name: "listener", HostnamePrefix: "*", } @@ -198,35 +241,43 @@ func generateScaleHTTPRouteManifests(numRoutes int, manifestDir string) error { backendName := "backend" for i := 0; i < numRoutes; i++ { - - route := Route{ + r := route{ Name: fmt.Sprintf("route-%d", i), HostnamePrefix: fmt.Sprintf("%d", i), ListenerName: "listener", BackendName: backendName, } - var listeners []Listener + var listeners []listener if i == 0 { // only generate a Gateway on the first iteration - listeners = []Listener{l} + listeners = []listener{l} } - if err := generateManifests(manifestDir, i, listeners, []Route{route}); err != nil { - return err + objects, err := generateManifests(listeners, []route{r}) + if err != nil { + return ScaleObjects{}, err } + result.ScaleIterationGroups = append(result.ScaleIterationGroups, objects) + } + + backendObjects, err := generateBackendAppObjects([]string{backendName}) + if err != nil { + return ScaleObjects{}, err } - return generateBackendAppManifests(getPrereqDirName(manifestDir), []string{backendName}) + result.BaseObjects = backendObjects + + return result, nil } -func generateManifests(outDir string, version int, listeners []Listener, routes []Route) error { +func generateManifests(listeners []listener, routes []route) ([]client.Object, error) { var buf bytes.Buffer if len(listeners) > 0 { if err := gwTmpl.Execute(&buf, listeners); err != nil { - return err + return nil, err } } @@ -236,42 +287,30 @@ func generateManifests(outDir string, version int, listeners []Listener, routes } if err := hrTmpl.Execute(&buf, r); err != nil { - return err + return nil, err } } - err := os.Mkdir(outDir, 0o750) - if err != nil && !os.IsExist(err) { - return err - } - - filename := fmt.Sprintf("manifest-%d.yaml", version) - path := filepath.Join(outDir, filename) - - fmt.Println("Writing", path) - return os.WriteFile(path, buf.Bytes(), 0o600) + return decodeObjects(&buf) } -func generateBackendAppManifests(outDir string, backends []string) error { - err := os.Mkdir(outDir, 0o750) - if err != nil && !os.IsExist(err) { - return err - } +func generateBackendAppObjects(backends []string) ([]client.Object, error) { + objects := make([]client.Object, 0, 2*len(backends)) for _, backend := range backends { var buf bytes.Buffer - if err = appTmpl.Execute(&buf, backend); err != nil { - return err + if err := appTmpl.Execute(&buf, backend); err != nil { + return nil, err } - path := filepath.Join(outDir, fmt.Sprintf("%s.yaml", backend)) - - fmt.Println("Writing", path) - if err := os.WriteFile(path, buf.Bytes(), 0o600); err != nil { - return err + objs, err := decodeObjects(&buf) + if err != nil { + return nil, err } + + objects = append(objects, objs...) } - return nil + return objects, nil } diff --git a/tests/framework/portforward.go b/tests/framework/portforward.go index 1efc16be3c..e8234fea7c 100644 --- a/tests/framework/portforward.go +++ b/tests/framework/portforward.go @@ -6,22 +6,25 @@ import ( "net/http" "net/url" "path" + "time" + + "log/slog" "k8s.io/client-go/rest" "k8s.io/client-go/tools/portforward" "k8s.io/client-go/transport/spdy" ) -// PortForward starts a port-forward to the specified Pod and returns the local port being forwarded. -func PortForward(config *rest.Config, namespace, podName string, stopCh chan struct{}) (int, error) { +// PortForward starts a port-forward to the specified Pod. 
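+// The forward runs in a background goroutine and is restarted on failure until stopCh is closed.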
+func PortForward(config *rest.Config, namespace, podName string, ports []string, stopCh <-chan struct{}) error { roundTripper, upgrader, err := spdy.RoundTripperFor(config) if err != nil { - return 0, fmt.Errorf("error creating roundtripper: %w", err) + return fmt.Errorf("error creating roundtripper: %w", err) } serverURL, err := url.Parse(config.Host) if err != nil { - return 0, fmt.Errorf("error parsing rest config host: %w", err) + return fmt.Errorf("error parsing rest config host: %w", err) } serverURL.Path = path.Join( @@ -33,25 +36,34 @@ func PortForward(config *rest.Config, namespace, podName string, stopCh chan str dialer := spdy.NewDialer(upgrader, &http.Client{Transport: roundTripper}, http.MethodPost, serverURL) - readyCh := make(chan struct{}, 1) out, errOut := new(bytes.Buffer), new(bytes.Buffer) - forwarder, err := portforward.New(dialer, []string{":80"}, stopCh, readyCh, out, errOut) - if err != nil { - return 0, fmt.Errorf("error creating port forwarder: %w", err) + forward := func() error { + readyCh := make(chan struct{}, 1) + + forwarder, err := portforward.New(dialer, ports, stopCh, readyCh, out, errOut) + if err != nil { + return fmt.Errorf("error creating port forwarder: %w", err) + } + + return forwarder.ForwardPorts() } go func() { - if err := forwarder.ForwardPorts(); err != nil { - panic(err) + for { + if err := forward(); err != nil { + slog.Error("error forwarding ports", "error", err) + slog.Info("retrying port forward in 100ms...") + } + + select { + case <-stopCh: + return + case <-time.After(100 * time.Millisecond): + // retrying + } } }() - <-readyCh - ports, err := forwarder.GetPorts() - if err != nil { - return 0, fmt.Errorf("error getting ports being forwarded: %w", err) - } - - return int(ports[0].Local), nil + return nil } diff --git a/tests/framework/prometheus.go b/tests/framework/prometheus.go new file mode 100644 index 0000000000..3afe01521f --- /dev/null +++ b/tests/framework/prometheus.go @@ -0,0 +1,293 @@ +package framework + +import ( + "context" + "encoding/csv" + "errors" + "fmt" + "log/slog" + "os" + "os/exec" + "time" + + "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + prometheusNamespace = "prom" + prometheusReleaseName = "prom" +) + +var defaultPrometheusQueryTimeout = 2 * time.Second + +// PrometheusConfig is the configuration for installing Prometheus +type PrometheusConfig struct { + // ScrapeInterval is the interval at which Prometheus scrapes metrics. + ScrapeInterval time.Duration + // QueryTimeout is the timeout for Prometheus queries. + // Default is 2s. + QueryTimeout time.Duration +} + +// InstallPrometheus installs Prometheus in the cluster. +// It waits for Prometheus pods to be ready before returning. 
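+// It requires the helm binary to be available on the test runner's PATH.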
+func InstallPrometheus( + rm ResourceManager, + cfg PrometheusConfig, +) (PrometheusInstance, error) { + output, err := exec.Command( + "helm", + "repo", + "add", + "prometheus-community", + "https://prometheus-community.github.io/helm-charts", + ).CombinedOutput() + if err != nil { + return PrometheusInstance{}, fmt.Errorf("failed to add Prometheus helm repo: %w; output: %s", err, string(output)) + } + + output, err = exec.Command( + "helm", + "repo", + "update", + ).CombinedOutput() + if err != nil { + return PrometheusInstance{}, fmt.Errorf("failed to update helm repos: %w; output: %s", err, string(output)) + } + + scrapeInterval := fmt.Sprintf("%ds", int(cfg.ScrapeInterval.Seconds())) + + output, err = exec.Command( + "helm", + "install", + prometheusReleaseName, + "prometheus-community/prometheus", + "--create-namespace", + "--namespace", prometheusNamespace, + "--set", fmt.Sprintf("server.global.scrape_interval=%s", scrapeInterval), + "--wait", + ).CombinedOutput() + if err != nil { + return PrometheusInstance{}, fmt.Errorf("failed to install Prometheus: %w; output: %s", err, string(output)) + } + + pods, err := rm.GetPods(prometheusNamespace, client.MatchingLabels{ + "app.kubernetes.io/name": "prometheus", + }) + if err != nil { + return PrometheusInstance{}, fmt.Errorf("failed to get Prometheus pods: %w", err) + } + + if len(pods) != 1 { + return PrometheusInstance{}, fmt.Errorf("expected one Prometheus pod, found %d", len(pods)) + } + + pod := pods[0] + + if pod.Status.PodIP == "" { + return PrometheusInstance{}, errors.New("Prometheus pod has no IP") + } + + var queryTimeout time.Duration + if cfg.QueryTimeout == 0 { + queryTimeout = defaultPrometheusQueryTimeout + } else { + queryTimeout = cfg.QueryTimeout + } + + return PrometheusInstance{ + podIP: pod.Status.PodIP, + podName: pod.Name, + podNamespace: pod.Namespace, + queryTimeout: queryTimeout, + }, nil +} + +// UninstallPrometheus uninstalls Prometheus from the cluster. +func UninstallPrometheus(rm ResourceManager) error { + output, err := exec.Command( + "helm", + "uninstall", + prometheusReleaseName, + "-n", prometheusNamespace, + ).CombinedOutput() + if err != nil { + return fmt.Errorf("failed to uninstall Prometheus: %w; output: %s", err, string(output)) + } + + if err := rm.DeleteNamespace(prometheusNamespace); err != nil { + return fmt.Errorf("failed to delete Prometheus namespace: %w", err) + } + + return nil +} + +const ( + // PrometheusPortForwardPort is the local port that will forward to the Prometheus API. + PrometheusPortForwardPort = 9090 + prometheusAPIPort = 9090 +) + +// PrometheusInstance represents a Prometheus instance in the cluster. +type PrometheusInstance struct { + podIP string + podName string + podNamespace string + portForward bool + queryTimeout time.Duration + + apiClient v1.API +} + +// PortForward starts port forwarding to the Prometheus instance. 
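+// It panics if port forwarding has already been started.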
+func (ins *PrometheusInstance) PortForward(config *rest.Config, stopCh <-chan struct{}) error { + if ins.portForward { + panic("port forwarding already started") + } + + ins.portForward = true + + ports := []string{fmt.Sprintf("%d:%d", PrometheusPortForwardPort, prometheusAPIPort)} + return PortForward(config, ins.podNamespace, ins.podName, ports, stopCh) +} + +func (ins *PrometheusInstance) getAPIClient() (v1.API, error) { + var endpoint string + if ins.portForward { + endpoint = fmt.Sprintf("http://localhost:%d", PrometheusPortForwardPort) + } else { + // On GKE, the test runner VM can access the Pod directly. + endpoint = fmt.Sprintf("http://%s:%d", ins.podIP, prometheusAPIPort) + } + + cfg := api.Config{ + Address: endpoint, + } + + c, err := api.NewClient(cfg) + if err != nil { + return nil, err + } + + return v1.NewAPI(c), nil +} + +func (ins *PrometheusInstance) ensureAPIClient() error { + if ins.apiClient == nil { + apiClient, err := ins.getAPIClient() + if err != nil { + return fmt.Errorf("failed to get Prometheus API client: %w", err) + } + ins.apiClient = apiClient + } + + return nil +} + +// Query sends a query to Prometheus. +func (ins *PrometheusInstance) Query(query string) (model.Value, error) { + ctx, cancel := context.WithTimeout(context.Background(), ins.queryTimeout) + defer cancel() + + return ins.QueryWithCtx(ctx, query) +} + +// QueryWithCtx sends a query to Prometheus with the specified context. +func (ins *PrometheusInstance) QueryWithCtx(ctx context.Context, query string) (model.Value, error) { + if err := ins.ensureAPIClient(); err != nil { + return nil, err + } + + result, warnings, err := ins.apiClient.Query(ctx, query, time.Time{}) + if err != nil { + return nil, fmt.Errorf("failed to query Prometheus: %w", err) + } + + if len(warnings) > 0 { + slog.Info( + "Prometheus query returned warnings", + "query", query, + "warnings", warnings, + ) + } + + return result, nil +} + +// QueryRange sends a range query to Prometheus. +func (ins *PrometheusInstance) QueryRange(query string, promRange v1.Range) (model.Value, error) { + ctx, cancel := context.WithTimeout(context.Background(), ins.queryTimeout) + defer cancel() + + return ins.QueryRangeWithCtx(ctx, query, promRange) +} + +// QueryRangeWithCtx sends a range query to Prometheus with the specified context. +func (ins *PrometheusInstance) QueryRangeWithCtx(ctx context.Context, query string, promRange v1.Range) (model.Value, error) { + if err := ins.ensureAPIClient(); err != nil { + return nil, err + } + + result, warnings, err := ins.apiClient.QueryRange(ctx, query, promRange) + if err != nil { + return nil, fmt.Errorf("failed to query Prometheus: %w", err) + } + + if len(warnings) > 0 { + slog.Info( + "Prometheus range query returned warnings", + "query", query, + "range", promRange, + "warnings", warnings, + ) + } + + return result, nil +} + +// GetFirstValueOfPrometheusVector returns the first value of a Prometheus vector. +func GetFirstValueOfPrometheusVector(val model.Value) (float64, error) { + res, ok := val.(model.Vector) + if !ok { + return 0, fmt.Errorf("expected a vector, got %T", val) + } + + if len(res) == 0 { + return 0, errors.New("empty vector") + } + + return float64(res[0].Value), nil +} + +// WritePrometheusMatrixToCSVFile writes a Prometheus matrix to a CSV file.
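+// Each record holds a sample's UNIX timestamp and its value.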
+func WritePrometheusMatrixToCSVFile(fileName string, value model.Value) error { + file, err := os.Create(fileName) + if err != nil { + return err + } + defer file.Close() + + csvWriter := csv.NewWriter(file) + + matrix, ok := value.(model.Matrix) + if !ok { + return fmt.Errorf("expected a matrix, got %T", value) + } + + for _, sample := range matrix { + for _, pair := range sample.Values { + record := []string{fmt.Sprint(pair.Timestamp.Unix()), pair.Value.String()} + if err := csvWriter.Write(record); err != nil { + return err + } + } + } + + csvWriter.Flush() + + // Surface any error encountered during writing or the final flush. + return csvWriter.Error() +} diff --git a/tests/framework/resourcemanager.go b/tests/framework/resourcemanager.go index fdc996c5d6..3cc73d08a8 100644 --- a/tests/framework/resourcemanager.go +++ b/tests/framework/resourcemanager.go @@ -27,9 +27,12 @@ import ( "fmt" "io" "net/http" + "reflect" "strings" "time" + "k8s.io/client-go/util/retry" + apps "k8s.io/api/apps/v1" core "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -71,7 +74,17 @@ func (rm *ResourceManager) Apply(resources []client.Object) error { defer cancel() for _, resource := range resources { - if err := rm.K8sClient.Get(ctx, client.ObjectKeyFromObject(resource), resource); err != nil { + var obj client.Object + + unstructuredObj, ok := resource.(*unstructured.Unstructured) + if ok { + obj = unstructuredObj.DeepCopy() + } else { + t := reflect.TypeOf(resource).Elem() + obj = reflect.New(t).Interface().(client.Object) + } + + if err := rm.K8sClient.Get(ctx, client.ObjectKeyFromObject(resource), obj); err != nil { if !apierrors.IsNotFound(err) { return fmt.Errorf("error getting resource: %w", err) } @@ -83,7 +96,16 @@ func (rm *ResourceManager) Apply(resources []client.Object) error { continue } - if err := rm.K8sClient.Update(ctx, resource); err != nil { + // Some tests modify resources that are also modified by NGF (to update their status), so conflicts are possible. + // For example, a Gateway resource. + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := rm.K8sClient.Get(ctx, client.ObjectKeyFromObject(resource), obj); err != nil { + return err + } + resource.SetResourceVersion(obj.GetResourceVersion()) + return rm.K8sClient.Update(ctx, resource) + }) + if err != nil { return fmt.Errorf("error updating resource: %w", err) } } @@ -112,8 +134,16 @@ func (rm *ResourceManager) ApplyFromFiles(files []string, namespace string) erro return nil } - obj.SetResourceVersion(fetchedObj.GetResourceVersion()) - if err := rm.K8sClient.Update(ctx, &obj); err != nil { + // Some tests modify resources that are also modified by NGF (to update their status), so conflicts are possible. + // For example, a Gateway resource. + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + if err := rm.K8sClient.Get(ctx, nsName, fetchedObj); err != nil { + return err + } + obj.SetResourceVersion(fetchedObj.GetResourceVersion()) + return rm.K8sClient.Update(ctx, &obj) + }) + if err != nil { return fmt.Errorf("error updating resource: %w", err) } @@ -137,7 +167,39 @@ func (rm *ResourceManager) Delete(resources []client.Object, opts ...client.Dele return nil } -// DeleteFromFile deletes Kubernetes resources defined within the provided YAML files.
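+// DeleteNamespace deletes the namespace with the given name and waits for its removal to complete.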
+func (rm *ResourceManager) DeleteNamespace(name string) error { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.DeleteNamespaceTimeout) + defer cancel() + + ns := &core.Namespace{} + if err := rm.K8sClient.Get(ctx, types.NamespacedName{Name: name}, ns); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("error getting namespace: %w", err) + } + + if err := rm.K8sClient.Delete(ctx, ns); err != nil { + return fmt.Errorf("error deleting namespace: %w", err) + } + + // Because the namespace deletion is asynchronous, we need to wait for the namespace to be deleted. + return wait.PollUntilContextCancel( + ctx, + 500*time.Millisecond, + true, /* poll immediately */ + func(ctx context.Context) (bool, error) { + if err := rm.K8sClient.Get(ctx, types.NamespacedName{Name: name}, ns); err != nil { + if apierrors.IsNotFound(err) { + return true, nil + } + return false, fmt.Errorf("error getting namespace: %w", err) + } + return false, nil + }) +} + +// DeleteFromFiles deletes Kubernetes resources defined within the provided YAML files. func (rm *ResourceManager) DeleteFromFiles(files []string, namespace string) error { handlerFunc := func(obj unstructured.Unstructured) error { obj.SetNamespace(namespace) @@ -241,7 +303,13 @@ func (rm *ResourceManager) WaitForAppsToBeReady(namespace string) error { ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.CreateTimeout) defer cancel() - if err := rm.waitForPodsToBeReady(ctx, namespace); err != nil { + return rm.WaitForAppsToBeReadyWithCtx(ctx, namespace) +} + +// WaitForAppsToBeReadyWithCtx waits for all apps in the specified namespace to be ready or +// until the provided context is cancelled. +func (rm *ResourceManager) WaitForAppsToBeReadyWithCtx(ctx context.Context, namespace string) error { + if err := rm.WaitForPodsToBeReady(ctx, namespace); err != nil { return err } @@ -252,7 +320,9 @@ func (rm *ResourceManager) WaitForAppsToBeReady(namespace string) error { return rm.waitForGatewaysToBeReady(ctx, namespace) } -func (rm *ResourceManager) waitForPodsToBeReady(ctx context.Context, namespace string) error { +// WaitForPodsToBeReady waits for all Pods in the specified namespace to be ready or +// until the provided context is cancelled. +func (rm *ResourceManager) WaitForPodsToBeReady(ctx context.Context, namespace string) error { return wait.PollUntilContextCancel( ctx, 500*time.Millisecond, @@ -447,6 +517,37 @@ func (rm *ResourceManager) GetPodNames(namespace string, labels client.MatchingL return names, nil } +// GetPods returns all Pods in the specified namespace that match the given labels. +func (rm *ResourceManager) GetPods(namespace string, labels client.MatchingLabels) ([]core.Pod, error) { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) + defer cancel() + + var podList core.PodList + if err := rm.K8sClient.List( + ctx, + &podList, + client.InNamespace(namespace), + labels, + ); err != nil { + return nil, fmt.Errorf("error getting list of Pods: %w", err) + } + + return podList.Items, nil +} + +// GetPod returns the Pod in the specified namespace with the given name. 
+func (rm *ResourceManager) GetPod(namespace, name string) (*core.Pod, error) { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) + defer cancel() + + var pod core.Pod + if err := rm.K8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &pod); err != nil { + return nil, fmt.Errorf("error getting Pod: %w", err) + } + + return &pod, nil +} + // GetPodLogs returns the logs from the specified Pod func (rm *ResourceManager) GetPodLogs(namespace, name string, opts *core.PodLogOptions) (string, error) { ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.GetTimeout) @@ -494,6 +595,24 @@ func (rm *ResourceManager) GetNGFDeployment(namespace, releaseName string) (*app return &deployment, nil } +// ScaleDeployment scales the Deployment to the specified number of replicas. +func (rm *ResourceManager) ScaleDeployment(namespace, name string, replicas int32) error { + ctx, cancel := context.WithTimeout(context.Background(), rm.TimeoutConfig.UpdateTimeout) + defer cancel() + + var deployment apps.Deployment + if err := rm.K8sClient.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &deployment); err != nil { + return fmt.Errorf("error getting Deployment: %w", err) + } + + deployment.Spec.Replicas = &replicas + if err := rm.K8sClient.Update(ctx, &deployment); err != nil { + return fmt.Errorf("error updating Deployment: %w", err) + } + + return nil +} + // GetReadyNGFPodNames returns the name(s) of the NGF Pod(s). func GetReadyNGFPodNames( k8sClient client.Client, diff --git a/tests/framework/results.go b/tests/framework/results.go index 7d5b8ad2ee..87d363b080 100644 --- a/tests/framework/results.go +++ b/tests/framework/results.go @@ -1,6 +1,7 @@ package framework import ( + "encoding/csv" "fmt" "io" "os" @@ -64,24 +65,48 @@ func WriteSystemInfoToFile(file *os.File, ci ClusterInfo, plus bool) error { return nil } -// GeneratePNG generates a PNG using gnuplot. -func GeneratePNG(resultsDir, inputFilename, outputFilename string) ([]byte, error) { +func generatePNG(resultsDir, inputFilename, outputFilename, configFilename string) error { pwd, err := os.Getwd() if err != nil { - return nil, err + return err } - gnuplotCfg := filepath.Join(filepath.Dir(pwd), "scripts", "requests-plot.gp") + gnuplotCfg := filepath.Join(filepath.Dir(pwd), "scripts", configFilename) files := fmt.Sprintf("inputfile='%s';outputfile='%s'", inputFilename, outputFilename) cmd := exec.Command("gnuplot", "-e", files, "-c", gnuplotCfg) cmd.Dir = resultsDir - return cmd.CombinedOutput() + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to generate PNG: %w; output: %s", err, string(output)) + } + + return nil +} + +// GenerateRequestsPNG generates a Requests PNG using gnuplot. +func GenerateRequestsPNG(resultsDir, inputFilename, outputFilename string) error { + return generatePNG(resultsDir, inputFilename, outputFilename, "requests-plot.gp") +} + +// GenerateTTRPNG generates a TTR PNG using gnuplot. +func GenerateTTRPNG(resultsDir, inputFilename, outputFilename string) error { + return generatePNG(resultsDir, inputFilename, outputFilename, "ttr-plot.gp") +} + +// GenerateCPUPNG generates a CPU usage PNG using gnuplot. +func GenerateCPUPNG(resultsDir, inputFilename, outputFilename string) error { + return generatePNG(resultsDir, inputFilename, outputFilename, "cpu-plot.gp") } -// WriteResults writes the vegeta metrics results to the results file in text format. 
-func WriteResults(resultsFile *os.File, metrics *Metrics) error { +// GenerateMemoryPNG generates a Memory usage PNG using gnuplot. +func GenerateMemoryPNG(resultsDir, inputFilename, outputFilename string) error { + return generatePNG(resultsDir, inputFilename, outputFilename, "memory-plot.gp") +} + +// WriteMetricsResults writes the metrics results to the results file in text format. +func WriteMetricsResults(resultsFile *os.File, metrics *Metrics) error { reporter := vegeta.NewTextReporter(&metrics.Metrics) return reporter.Report(resultsFile) @@ -96,7 +121,27 @@ func WriteContent(resultsFile *os.File, content string) error { return nil } -// NewCSVEncoder returns a vegeta CSV encoder. -func NewCSVEncoder(w io.Writer) vegeta.Encoder { +// NewVegetaCSVEncoder returns a vegeta CSV encoder. +func NewVegetaCSVEncoder(w io.Writer) vegeta.Encoder { return vegeta.NewCSVEncoder(w) } + +// NewCSVResultsWriter creates and returns a CSV results file and writer. +func NewCSVResultsWriter(resultsDir, fileName string, resultHeaders ...string) (*os.File, *csv.Writer, error) { + if err := os.MkdirAll(resultsDir, 0o750); err != nil { + return nil, nil, err + } + + file, err := os.OpenFile(filepath.Join(resultsDir, fileName), os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o644) + if err != nil { + return nil, nil, err + } + + writer := csv.NewWriter(file) + + if err = writer.Write(resultHeaders); err != nil { + // Close the file on error; the header write error takes precedence. + file.Close() + return nil, nil, err + } + + return file, writer, nil +} diff --git a/tests/framework/timeout.go b/tests/framework/timeout.go index 2aee2e5a21..2ed69457db 100644 --- a/tests/framework/timeout.go +++ b/tests/framework/timeout.go @@ -6,9 +6,15 @@ type TimeoutConfig struct { // CreateTimeout represents the maximum time for a Kubernetes object to be created. CreateTimeout time.Duration + // UpdateTimeout represents the maximum time for a Kubernetes object to be updated. + UpdateTimeout time.Duration + // DeleteTimeout represents the maximum time for a Kubernetes object to be deleted. DeleteTimeout time.Duration + // DeleteNamespaceTimeout represents the maximum time for a Kubernetes namespace to be deleted. + DeleteNamespaceTimeout time.Duration + // GetTimeout represents the maximum time to get a Kubernetes object.
GetTimeout time.Duration @@ -29,7 +35,9 @@ type TimeoutConfig struct { func DefaultTimeoutConfig() TimeoutConfig { return TimeoutConfig{ CreateTimeout: 60 * time.Second, + UpdateTimeout: 60 * time.Second, DeleteTimeout: 10 * time.Second, + DeleteNamespaceTimeout: 60 * time.Second, GetTimeout: 10 * time.Second, ManifestFetchTimeout: 10 * time.Second, RequestTimeout: 10 * time.Second, diff --git a/tests/scale/results/1.0.0/1.0.0.md b/tests/results/scale/1.0.0/1.0.0.md similarity index 100% rename from tests/scale/results/1.0.0/1.0.0.md rename to tests/results/scale/1.0.0/1.0.0.md diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/CPU-no-delay.png b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/CPU-no-delay.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/CPU-no-delay.png rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/CPU-no-delay.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/CPU.png b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/CPU.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/CPU.png rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/CPU.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/Memory-no-delay.png b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/Memory-no-delay.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/Memory-no-delay.png rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/Memory-no-delay.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/Memory.png b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/Memory.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/Memory.png rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/Memory.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/TTR.png b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/TTR.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/TTR.png rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/TTR.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/results-no-delay.csv b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/results-no-delay.csv similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/results-no-delay.csv rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/results-no-delay.csv diff --git a/tests/scale/results/1.0.0/TestScale_HTTPRoutes/results.csv b/tests/results/scale/1.0.0/TestScale_HTTPRoutes/results.csv similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPRoutes/results.csv rename to tests/results/scale/1.0.0/TestScale_HTTPRoutes/results.csv diff --git a/tests/scale/results/1.0.0/TestScale_HTTPSListeners/CPU.png b/tests/results/scale/1.0.0/TestScale_HTTPSListeners/CPU.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPSListeners/CPU.png rename to tests/results/scale/1.0.0/TestScale_HTTPSListeners/CPU.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPSListeners/Memory.png b/tests/results/scale/1.0.0/TestScale_HTTPSListeners/Memory.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPSListeners/Memory.png rename to tests/results/scale/1.0.0/TestScale_HTTPSListeners/Memory.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPSListeners/TTR.png b/tests/results/scale/1.0.0/TestScale_HTTPSListeners/TTR.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPSListeners/TTR.png rename to 
tests/results/scale/1.0.0/TestScale_HTTPSListeners/TTR.png diff --git a/tests/scale/results/1.0.0/TestScale_HTTPSListeners/results.csv b/tests/results/scale/1.0.0/TestScale_HTTPSListeners/results.csv similarity index 100% rename from tests/scale/results/1.0.0/TestScale_HTTPSListeners/results.csv rename to tests/results/scale/1.0.0/TestScale_HTTPSListeners/results.csv diff --git a/tests/scale/results/1.0.0/TestScale_Listeners/CPU.png b/tests/results/scale/1.0.0/TestScale_Listeners/CPU.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_Listeners/CPU.png rename to tests/results/scale/1.0.0/TestScale_Listeners/CPU.png diff --git a/tests/scale/results/1.0.0/TestScale_Listeners/Memory.png b/tests/results/scale/1.0.0/TestScale_Listeners/Memory.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_Listeners/Memory.png rename to tests/results/scale/1.0.0/TestScale_Listeners/Memory.png diff --git a/tests/scale/results/1.0.0/TestScale_Listeners/TTR.png b/tests/results/scale/1.0.0/TestScale_Listeners/TTR.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_Listeners/TTR.png rename to tests/results/scale/1.0.0/TestScale_Listeners/TTR.png diff --git a/tests/scale/results/1.0.0/TestScale_Listeners/results.csv b/tests/results/scale/1.0.0/TestScale_Listeners/results.csv similarity index 100% rename from tests/scale/results/1.0.0/TestScale_Listeners/results.csv rename to tests/results/scale/1.0.0/TestScale_Listeners/results.csv diff --git a/tests/scale/results/1.0.0/TestScale_UpstreamServers/CPU.png b/tests/results/scale/1.0.0/TestScale_UpstreamServers/CPU.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_UpstreamServers/CPU.png rename to tests/results/scale/1.0.0/TestScale_UpstreamServers/CPU.png diff --git a/tests/scale/results/1.0.0/TestScale_UpstreamServers/Memory.png b/tests/results/scale/1.0.0/TestScale_UpstreamServers/Memory.png similarity index 100% rename from tests/scale/results/1.0.0/TestScale_UpstreamServers/Memory.png rename to tests/results/scale/1.0.0/TestScale_UpstreamServers/Memory.png diff --git a/tests/scale/results/1.1.0/1.1.0.md b/tests/results/scale/1.1.0/1.1.0.md similarity index 100% rename from tests/scale/results/1.1.0/1.1.0.md rename to tests/results/scale/1.1.0/1.1.0.md diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/CPU-no-delay.png b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/CPU-no-delay.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/CPU-no-delay.png rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/CPU-no-delay.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/CPU.png b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/CPU.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/CPU.png rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/CPU.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/Memory-no-delay.png b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/Memory-no-delay.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/Memory-no-delay.png rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/Memory-no-delay.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/Memory.png b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/Memory.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/Memory.png rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/Memory.png diff --git 
a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/TTR.png b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/TTR.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/TTR.png rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/TTR.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/results-no-delay.csv b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/results-no-delay.csv similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/results-no-delay.csv rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/results-no-delay.csv diff --git a/tests/scale/results/1.1.0/TestScale_HTTPRoutes/results.csv b/tests/results/scale/1.1.0/TestScale_HTTPRoutes/results.csv similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPRoutes/results.csv rename to tests/results/scale/1.1.0/TestScale_HTTPRoutes/results.csv diff --git a/tests/scale/results/1.1.0/TestScale_HTTPSListeners/CPU.png b/tests/results/scale/1.1.0/TestScale_HTTPSListeners/CPU.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPSListeners/CPU.png rename to tests/results/scale/1.1.0/TestScale_HTTPSListeners/CPU.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPSListeners/Memory.png b/tests/results/scale/1.1.0/TestScale_HTTPSListeners/Memory.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPSListeners/Memory.png rename to tests/results/scale/1.1.0/TestScale_HTTPSListeners/Memory.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPSListeners/TTR.png b/tests/results/scale/1.1.0/TestScale_HTTPSListeners/TTR.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPSListeners/TTR.png rename to tests/results/scale/1.1.0/TestScale_HTTPSListeners/TTR.png diff --git a/tests/scale/results/1.1.0/TestScale_HTTPSListeners/results.csv b/tests/results/scale/1.1.0/TestScale_HTTPSListeners/results.csv similarity index 100% rename from tests/scale/results/1.1.0/TestScale_HTTPSListeners/results.csv rename to tests/results/scale/1.1.0/TestScale_HTTPSListeners/results.csv diff --git a/tests/scale/results/1.1.0/TestScale_Listeners/CPU.png b/tests/results/scale/1.1.0/TestScale_Listeners/CPU.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_Listeners/CPU.png rename to tests/results/scale/1.1.0/TestScale_Listeners/CPU.png diff --git a/tests/scale/results/1.1.0/TestScale_Listeners/Memory.png b/tests/results/scale/1.1.0/TestScale_Listeners/Memory.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_Listeners/Memory.png rename to tests/results/scale/1.1.0/TestScale_Listeners/Memory.png diff --git a/tests/scale/results/1.1.0/TestScale_Listeners/TTR.png b/tests/results/scale/1.1.0/TestScale_Listeners/TTR.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_Listeners/TTR.png rename to tests/results/scale/1.1.0/TestScale_Listeners/TTR.png diff --git a/tests/scale/results/1.1.0/TestScale_Listeners/results.csv b/tests/results/scale/1.1.0/TestScale_Listeners/results.csv similarity index 100% rename from tests/scale/results/1.1.0/TestScale_Listeners/results.csv rename to tests/results/scale/1.1.0/TestScale_Listeners/results.csv diff --git a/tests/scale/results/1.1.0/TestScale_UpstreamServers/CPU.png b/tests/results/scale/1.1.0/TestScale_UpstreamServers/CPU.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_UpstreamServers/CPU.png rename to tests/results/scale/1.1.0/TestScale_UpstreamServers/CPU.png diff --git 
a/tests/scale/results/1.1.0/TestScale_UpstreamServers/Memory.png b/tests/results/scale/1.1.0/TestScale_UpstreamServers/Memory.png similarity index 100% rename from tests/scale/results/1.1.0/TestScale_UpstreamServers/Memory.png rename to tests/results/scale/1.1.0/TestScale_UpstreamServers/Memory.png diff --git a/tests/scale/results/1.2.0/1.2.0.md b/tests/results/scale/1.2.0/1.2.0.md similarity index 100% rename from tests/scale/results/1.2.0/1.2.0.md rename to tests/results/scale/1.2.0/1.2.0.md diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes/CPU.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes/CPU.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes/Memory.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes/Memory.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes/TTR-without-peak.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes/TTR-without-peak.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes/TTR-without-peak.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes/TTR-without-peak.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes/TTR.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes/TTR.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes/TTR.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/CPU.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/CPU.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/Memory.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/Memory.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/TTR.png b/tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPRoutes_Plus/TTR.png rename to tests/results/scale/1.2.0/TestScale_HTTPRoutes_Plus/TTR.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPSListeners/CPU.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners/CPU.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPSListeners/Memory.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners/Memory.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPSListeners/TTR.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners/TTR.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners/TTR.png diff --git 
a/tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/CPU.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/CPU.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/Memory.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/Memory.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/TTR.png b/tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_HTTPSListeners_Plus/TTR.png rename to tests/results/scale/1.2.0/TestScale_HTTPSListeners_Plus/TTR.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners/CPU.png b/tests/results/scale/1.2.0/TestScale_Listeners/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners/CPU.png rename to tests/results/scale/1.2.0/TestScale_Listeners/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners/Memory.png b/tests/results/scale/1.2.0/TestScale_Listeners/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners/Memory.png rename to tests/results/scale/1.2.0/TestScale_Listeners/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners/TTR.png b/tests/results/scale/1.2.0/TestScale_Listeners/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners/TTR.png rename to tests/results/scale/1.2.0/TestScale_Listeners/TTR.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners_Plus/CPU.png b/tests/results/scale/1.2.0/TestScale_Listeners_Plus/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners_Plus/CPU.png rename to tests/results/scale/1.2.0/TestScale_Listeners_Plus/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners_Plus/Memory.png b/tests/results/scale/1.2.0/TestScale_Listeners_Plus/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners_Plus/Memory.png rename to tests/results/scale/1.2.0/TestScale_Listeners_Plus/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_Listeners_Plus/TTR.png b/tests/results/scale/1.2.0/TestScale_Listeners_Plus/TTR.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_Listeners_Plus/TTR.png rename to tests/results/scale/1.2.0/TestScale_Listeners_Plus/TTR.png diff --git a/tests/scale/results/1.2.0/TestScale_UpstreamServers/CPU.png b/tests/results/scale/1.2.0/TestScale_UpstreamServers/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_UpstreamServers/CPU.png rename to tests/results/scale/1.2.0/TestScale_UpstreamServers/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_UpstreamServers/Memory.png b/tests/results/scale/1.2.0/TestScale_UpstreamServers/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_UpstreamServers/Memory.png rename to tests/results/scale/1.2.0/TestScale_UpstreamServers/Memory.png diff --git a/tests/scale/results/1.2.0/TestScale_UpstreamServers_Plus/CPU.png b/tests/results/scale/1.2.0/TestScale_UpstreamServers_Plus/CPU.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_UpstreamServers_Plus/CPU.png rename to 
tests/results/scale/1.2.0/TestScale_UpstreamServers_Plus/CPU.png diff --git a/tests/scale/results/1.2.0/TestScale_UpstreamServers_Plus/Memory.png b/tests/results/scale/1.2.0/TestScale_UpstreamServers_Plus/Memory.png similarity index 100% rename from tests/scale/results/1.2.0/TestScale_UpstreamServers_Plus/Memory.png rename to tests/results/scale/1.2.0/TestScale_UpstreamServers_Plus/Memory.png diff --git a/tests/scale/manifests/prom-clusterrole.yaml b/tests/scale/manifests/prom-clusterrole.yaml deleted file mode 100644 index f8aefdd36e..0000000000 --- a/tests/scale/manifests/prom-clusterrole.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: prom ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus - namespace: prom ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus - namespace: prom -rules: -- apiGroups: [""] - resources: - - nodes - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus - namespace: prom -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus - namespace: prom diff --git a/tests/scale/scale.md b/tests/scale/scale.md deleted file mode 100644 index 328378596c..0000000000 --- a/tests/scale/scale.md +++ /dev/null @@ -1,527 +0,0 @@ -# Scale Tests - -This document describes how we scale test NGF. - - -- [Scale Tests](#scale-tests) - - [Goals](#goals) - - [Test Environment](#test-environment) - - [Steps](#steps) - - [Setup](#setup) - - [Run the tests](#run-the-tests) - - [Scale Listeners to Max of 64](#scale-listeners-to-max-of-64) - - [Scale HTTPS Listeners to Max of 64](#scale-https-listeners-to-max-of-64) - - [Scale HTTPRoutes](#scale-httproutes) - - [Scale Upstream Servers](#scale-upstream-servers) - - [Scale HTTP Matches](#scale-http-matches) - - [Analyze](#analyze) - - [Results](#results) - - -## Goals - -- Measure how NGF performs when the number of Gateway API and referenced core Kubernetes resources are scaled. -- Test the following number of resources: - - Max number of HTTP and HTTPS Listeners (64) - - Max number of Upstream Servers (648) - - Max number of HTTPMatches - - 1000 HTTPRoutes - -## Test Environment - -For most of the tests, the following cluster will be sufficient: - -- A Kubernetes cluster with 4 nodes on GKE - - Node: n2d-standard-8 (8 vCPU, 32GB memory) - - Enabled GKE logging -- A GKE VM with access to the cluster. Send all requests from the GKE VM. - -The Upstream Server scale test requires a bigger cluster to accommodate the large number of Pods. Those cluster details -are listed in the [Scale Upstream Servers](#scale-upstream-servers) test steps. 
- -## Steps - -### Setup - -- Update the version in the [test](scale_test.go) to the release version - -- Install Gateway API Resources: - - ```console - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.1.0/standard-install.yaml - ``` - -- Install edge NGF: - - For OSS: - - ```console - helm install scale-test oci://ghcr.io/nginxinc/charts/nginx-gateway-fabric --create-namespace --wait -n nginx-gateway --version=0.0.0-edge --set nginxGateway.config.logging.level=debug - ``` - - For Plus: - - - Build the NGINX Plus image - - Push the image to the GCR registry - - ```console - helm install scale-test oci://ghcr.io/nginxinc/charts/nginx-gateway-fabric --create-namespace --wait -n nginx-gateway --version=0.0.0-edge --set nginxGateway.config.logging.level=debug --set nginx.plus=true --set nginx.image.repository= --set nginx.image.tag= - ``` - -- Save the Pod Name and LoadBalancer IP for tests: - - ```console - export NGF_IP=$(kubectl get svc -n nginx-gateway scale-test-nginx-gateway-fabric --output jsonpath='{.status.loadBalancer.ingress[0].ip}') - export NGF_POD=$(kubectl get pods -n nginx-gateway -l "app.kubernetes.io/name=nginx-gateway-fabric,app.kubernetes.io/instance=scale-test" -o jsonpath="{.items[0].metadata.name}") - ``` - -- Install Prometheus: - - ```console - kubectl apply -f manifests/prom-clusterrole.yaml - helm repo add prometheus-community https://prometheus-community.github.io/helm-charts - helm repo update - helm install prom prometheus-community/prometheus --set useExistingClusterRoleName=prometheus -n prom - ``` - -- Create a directory under [results](/tests/scale/results) and name it after the version of NGF you are testing. Then - create a file for the result summary, also named after the NGF version. For - example: [1.0.0.md](/tests/scale/results/1.0.0/1.0.0.md). - -### Run the tests - -Run the following tests for both N+ and OSS. - -#### Scale Listeners to Max of 64 - -Test Goal: Measure how NGF performs as the number of Listeners increases to the max of 64. - -Test Plan: - -- Scale up to 64 HTTP Listeners -- All Listeners are on a single Gateway -- Each Listener has 1 HTTPRoute attached -- Each HTTPRoute references 1 unique Service -- Services and Deployments are created before scaling Listeners. -- After each Listener + HTTPRoute is created, measure the time it takes to get a successful response from the new - route (time to ready). -- Record the time to ready in seconds in a csv file for each iteration. - -Total Resources Created: - -- 1 Gateway with 64 Listeners -- 64 HTTPRoutes -- 64 Services, Deployments, Pods - -Follow the steps below to run the test: - -- Run the test: - - With OSS: - - ```console - go test -v -tags scale -run TestScale_Listeners -i 64 - ``` - - With Plus: - - ```console - go test -v -tags scale -run TestScale_Listeners -i 64 -plus - ``` - -- [Analyze](#analyze) the results. - -- Clean up: - - Delete resources from cluster: - - ```console - kubectl delete -Rf TestScale_Listeners - ``` - - Delete generated manifests: - - ```console - rm -rf TestScale_Listeners - ``` - -- Check for any errors or restarts after cleanup. -- Check NGINX conf to make sure it looks correct. - -#### Scale HTTPS Listeners to Max of 64 - -Test Goal: Measure how NGF performs as the number of HTTPS Listeners increases to the max of 64. 
- -Test Plan: - -- Scale up to 64 HTTPS Listeners -- All Listeners are on a single Gateway -- Each Listener has 1 HTTPRoute attached -- Each Listener references a unique Secret -- Each HTTPRoute references 1 unique Service -- Services, Deployments, and Secrets are created before scaling Listeners -- After each Listener + HTTPRoute is created, measure the time it takes to get a successful response from the new - route (time to ready). -- Record the time to ready in seconds in a csv file for each iteration. - -Total Resources Created: - -- 1 Gateway with 64 HTTPS Listeners -- 64 Secrets -- 64 HTTPRoutes -- 64 Services, Deployments, Pods - -Follow the steps below to run the test: - -- Run the test: - - With OSS: - - ```console - go test -v -tags scale -run TestScale_HTTPSListeners -i 64 - ``` - - With Plus: - - ```console - go test -v -tags scale -run TestScale_HTTPSListeners -i 64 -plus - ``` - -- [Analyze](#analyze) the results. - -- Clean up: - - Delete resources from cluster: - - ```console - kubectl delete -Rf TestScale_HTTPSListeners - ``` - - Delete generated manifests: - - ```console - rm -rf TestScale_HTTPSListeners - ``` - -- Check for any errors or restarts after cleanup. -- Check NGINX conf to make sure it looks correct. - -#### Scale HTTPRoutes - -Test Goal: Measure how NGF performs as the number of HTTPRoutes increases to 1000. - -Test Plan: - -- Scale up to 1000 HTTPRoutes -- All HTTPRoutes attach to a single Gateway with one Listener -- Each HTTPRoute references the same Service -- Service and Deployment are created before scaling HTTPRoutes -- After each HTTPRoute is created, measure the time it takes to get a successful response from the new route (time to - ready). -- Record the time to ready in seconds in a csv file for each iteration. - -Total Resources Created: - -- 1 Gateway with 1 Listener -- 1000 HTTPRoutes -- 1 Service, Deployment, Pod - -Follow the steps below to run the test: - -- Run the test: - - With OSS: - - ```console - go test -v -tags scale -timeout 30m -run TestScale_HTTPRoutes -i 1000 - ``` - - With Plus: - - ```console - go test -v -tags scale -timeout 30m -run TestScale_HTTPRoutes -i 1000 -plus - ``` - - **Optional:** To test with a delay in between each new HTTPRoute, you can add the `-delay` flag to the above command. For example, - to add a 2-second delay: - - ```console - go test -v -tags scale -timeout 60m -run TestScale_HTTPRoutes -i 1000 -delay 2s - ``` - - The test takes longer to run with a delay so make sure to adjust the timeout value. - -- [Analyze](#analyze) the results. - -- Clean up: - - Delete resources from cluster: - - ```console - kubectl delete -Rf TestScale_HTTPRoutes - ``` - - Delete generated manifests: - - ```console - rm -rf TestScale_HTTPRoutes - ``` - -- Check for any errors or restarts after cleanup. -- Check NGINX conf to make sure it looks correct. - -#### Scale Upstream Servers - -Test Goal: Measure how NGF performs as the number of Upstream Servers increases to the max of 648. - -Test Plan: - -- Deploy a single Gateway with 1 Listener and attach one HTTPRoute that references a single Service -- Scale the deployment for that Service to 648 Pods for OSS and 556 Pods for Plus (these are the limits that the upstream zone size allows) -- Gateway, HTTPRoute, Service, and Deployment with 1 replica are created before scaling up to 648 replicas. 
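The 648 (OSS) versus 556 (Plus) split above is dictated by how many server entries fit in the fixed-size upstream zone. If you want to eyeball the zone directive NGF generates, a sketch; it assumes the `NGF_POD` variable exported during setup, and since the zone name is generated by NGF, it just greps for the directive:

```console
kubectl exec -it -n nginx-gateway $NGF_POD -c nginx -- nginx -T | grep "zone "
```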
-
-Total Resources Created:
-
-- 1 Gateway with 1 Listener
-- 1 HTTPRoute
-- 1 Service, 1 Deployment, 648 Pods
-
-Test Environment:
-
-For this test you must use a much bigger cluster in order to create 648 Pods.
-
-- A Kubernetes cluster with 12 nodes on GKE
-  - Node: n2d-standard-16 (16 vCPU, 64GB memory)
-  - Enabled GKE logging
-
-Follow the steps below to run the test:
-
-- Apply manifest:
-
-  ```console
-  kubectl apply -f manifests/scale-upstreams.yaml
-  ```
-
-- Check the status of the Gateway and HTTPRoute to make sure everything is OK before scaling.
-
-  ```console
-  kubectl describe gateway gateway
-  kubectl describe httproute route
-  ```
-
-- Get the start time as a UNIX timestamp and record it in the results.
-
-  ```console
-  date +%s
-  ```
-
-  This will be used in the metrics query.
-
-- Open a new terminal window and start the following loop:
-
-  ```console
-  for i in $(seq 1 150); do curl --resolve cafe.example.com:80:$NGF_IP http://cafe.example.com:80/; sleep 1; done >> requests.log
-  ```
-
-- Back in your original terminal, scale the backend app:
-
-  For OSS:
-
-  ```console
-  kubectl scale deploy backend --replicas 648
-  ```
-
-  For Plus:
-
-  ```console
-  kubectl scale deploy backend --replicas 556
-  ```
-
-- Wait for all Pods to become available:
-
-  ```console
-  watch kubectl get deploy backend
-  ```
-
-- Check the NGINX config for 648 upstream servers:
-
-  ```console
-  kubectl exec -it -n nginx-gateway $NGF_POD -c nginx -- nginx -T | grep -E "server (?:[0-9]{1,3}\.){3}[0-9]{1,3}:8080" | wc -l
-  ```
-
-- Get the end time as a UNIX timestamp and make a note of it:
-
-  ```console
-  date +%s
-  ```
-
-- In the terminal where you started the request loop, kill the loop if it's still running and check the requests.log to see if
-  any of the requests failed. Record any failures in the results file.
-
-- [Analyze](#analyze) the results. Use the start time and end time you made note of earlier for the
-  queries. You can calculate the test duration in seconds by subtracting the start time from the end time.
-
-- Clean up:
-
-  ```console
-  kubectl delete -f manifests/scale-upstreams.yaml
-  ```
-
-- Check for any errors or restarts after cleanup.
-- Check NGINX conf to make sure it looks correct.
-
-#### Scale HTTP Matches
-
-Test Goal: Find the difference in latency between the first match and last match for the max length of
-the `http_matches` variable.
-
-Test Plan:
-
-- Deploy a single Gateway with 1 Listener and attach one HTTPRoute that references a single Service
-- Within the HTTPRoute configure the max number of matches (max is determined by the length of the
-  generated `http_matches` variable (4096 characters))
-- Use `wrk` to send requests to the _first_ match in the `http_matches` list and measure the latency
-- Use `wrk` to send requests to the _last_ match in the `http_matches` list and measure the latency
-
-Total Resources Created:
-
-- 1 Gateway with 1 Listener
-- 1 HTTPRoute with 7 rules and 50 matches
-- 1 Service, 1 Deployment, 1 Pod
-
-Follow these steps to run the test:
-
-- Download [wrk](https://github.com/wg/wrk)
-
-- Apply manifest:
-
-  ```console
-  kubectl apply -f manifests/scale-matches.yaml
-  ```
-
-- Check the status of the Gateway and HTTPRoute to make sure everything is OK before scaling.
-
-  ```console
-  kubectl describe gateway gateway
-  kubectl describe httproute route
-  ```
-
-- Edit your /etc/hosts file and add an entry for "<NGF_IP> cafe.example.com".
-
-- Test the first match:
-
-  ```console
-  wrk -t2 -c10 -d30 http://cafe.example.com -H "header-1: header-1-val"
-  ```
-
-- Test the last match:
-
-  ```console
-  wrk -t2 -c10 -d30 http://cafe.example.com -H "header-50: header-50-val"
-  ```
-
-- Copy and paste the results into the results file.
-
-- Clean up:
-
-  ```console
-  kubectl delete -f manifests/scale-matches.yaml
-  ```
-
-### Analyze
-
-- Query Prometheus for reload metrics. To access the Prometheus Server, run:
-
-  ```console
-  export POD_NAME=$(kubectl get pods --namespace prom -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/instance=prom" -o jsonpath="{.items[0].metadata.name}")
-  kubectl --namespace prom port-forward $POD_NAME 9090
-  ```
-
-  To query Prometheus, you can either browse to localhost:9090 or use curl (see the example after this section). The
-  following instructions assume you are using the prom GUI.
-
-  > Note:
-  > For the tests that write to a csv file, the `Test Start`, `Test End + 10s`, and `Duration` are at the
-  > end of the results.csv file in the `results/<version>/<test name>` directory.
-  > If you ran the tests with the `plus` flag, the test directory is named `results/<version>/<test name>_Plus`.
-  > We are using `Test End + 10s` in the Prometheus query to account for the 10s scraping interval.
-
-  Total number of reloads:
-
-  ```console
-  nginx_gateway_fabric_nginx_reloads_total - nginx_gateway_fabric_nginx_reloads_total @ <start time>
-  ```
-
-  Total number of reload errors:
-
-  ```console
-  nginx_gateway_fabric_nginx_reload_errors_total - nginx_gateway_fabric_nginx_reload_errors_total @ <start time>
-  ```
-
-  Average reload time (ms):
-
-  ```console
-  rate(nginx_gateway_fabric_nginx_reloads_milliseconds_sum[<test duration>s] @ <end time>) /
-  rate(nginx_gateway_fabric_nginx_reloads_milliseconds_count[<test duration>s] @ <end time>)
-  ```
-
-  Reload Time Distribution:
-
-  ```console
-  nginx_gateway_fabric_nginx_reloads_milliseconds_bucket - nginx_gateway_fabric_nginx_reloads_milliseconds_bucket @ <start time>
-  ```
-
-  Total number of event batches processed:
-
-  ```console
-  nginx_gateway_fabric_event_batch_processing_milliseconds_count - nginx_gateway_fabric_event_batch_processing_milliseconds_count @ <start time>
-  ```
-
-  Average event batch processing time (ms):
-
-  ```console
-  rate(nginx_gateway_fabric_event_batch_processing_milliseconds_sum[<test duration>s] @ <end time>) /
-  rate(nginx_gateway_fabric_event_batch_processing_milliseconds_count[<test duration>s] @ <end time>)
-  ```
-
-  Event Batch Processing Time Distribution:
-
-  ```console
-  nginx_gateway_fabric_event_batch_processing_milliseconds_bucket - nginx_gateway_fabric_event_batch_processing_milliseconds_bucket @ <start time>
-  ```
-
-  Record these numbers in a table in the results file.
-
-- Take screenshots of memory and CPU usage in the GKE Dashboard.
-
-  To monitor memory and CPU usage, navigate to Kubernetes Engine > Workloads > Filter by `nginx-gateway` namespace >
-  click on the NGF Pod name. You should see graphs for CPU, Memory, and Disk.
-
-  - Convert the `Start Time` and `End Time` UNIX timestamps to your local date time:
-
-    ```console
-    date -r <timestamp>
-    ```
-
-  - Create a custom time frame for the graphs in GKE.
-  - Take a screenshot of the CPU and Memory graphs individually. Store them in the `results/<version>/<test name>`
-    directory.
-
-- If the test writes time to ready numbers to a csv, create a time to ready graph.
-  - Use https://chart-studio.plotly.com/create/#/ to plot the time to ready numbers on a graph.
-  - Remove the `"Test Start", "Test End", "Test End + 10s", "Duration"` rows from the bottom of the csv.
-  - Upload the csv file to plotly.
-  - Create a new `Trace`, select `Line` as the type.
-  - Set the Y axis to the Time to Ready column.
-  - Set the X axis to the number of resources column.
-  - Label the graph and take a screenshot.
-  - Store the graph in the `results/<version>/<test name>` directory.
-
-- Check for errors or restarts and record them in the results file. File a bug if there are unexpected errors or restarts.
-- Check NGINX conf and make sure it looks correct. File a bug if there is an issue.
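As noted above, the same expressions can be sent to the Prometheus HTTP API with curl instead of the GUI. A minimal sketch, assuming the port-forward from the first step is still active and using a placeholder start timestamp:

```console
curl -s --get 'http://localhost:9090/api/v1/query' \
  --data-urlencode 'query=nginx_gateway_fabric_nginx_reloads_total - nginx_gateway_fabric_nginx_reloads_total @ 1700000000'
```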
-
-### Results
-
-- [1.0.0](/tests/scale/results/1.0.0/1.0.0.md)
-- [1.1.0](/tests/scale/results/1.1.0/1.1.0.md)
-- [1.2.0](/tests/scale/results/1.2.0/1.2.0.md)
diff --git a/tests/scale/scale_test.go b/tests/scale/scale_test.go
deleted file mode 100644
index 89f87a1d82..0000000000
--- a/tests/scale/scale_test.go
+++ /dev/null
@@ -1,259 +0,0 @@
-//go:build scale
-// +build scale
-
-package scale
-
-import (
-	"context"
-	"crypto/tls"
-	"encoding/csv"
-	"flag"
-	"fmt"
-	"net/http"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"strconv"
-	"strings"
-	"testing"
-	"time"
-
-	"k8s.io/apimachinery/pkg/util/wait"
-)
-
-// testing flags
-var (
-	numIterations = flag.Int("i", 1, "number of times to scale the resource")
-	delay         = flag.Duration("delay", 0, "delay between each scaling iteration")
-	version       = flag.String("version", "1.2.0", "version of NGF under test")
-	plus          = flag.Bool("plus", false, "nginx-plus enabled")
-)
-
-func TestScale_Listeners(t *testing.T) {
-	ip := getIP(t)
-	url := fmt.Sprintf("http://%s/", ip)
-
-	runScaleTest(
-		t,
-		[]string{"# Listeners", "Time to Ready (s)", "Error"},
-		func(dir string) error {
-			return generateScaleListenerManifests(*numIterations, dir, false /*non-tls*/)
-		},
-		url,
-	)
-}
-
-func TestScale_HTTPSListeners(t *testing.T) {
-	ip := getIP(t)
-	url := fmt.Sprintf("https://%s/", ip)
-
-	runScaleTest(
-		t,
-		[]string{"# HTTPS Listeners", "Time to Ready (s)", "Error"},
-		func(dir string) error {
-			return generateScaleListenerManifests(*numIterations, dir, true /*tls*/)
-		},
-		url,
-	)
-}
-
-func TestScale_HTTPRoutes(t *testing.T) {
-	ip := getIP(t)
-	url := fmt.Sprintf("http://%s/", ip)
-
-	runScaleTest(
-		t,
-		[]string{"# HTTPRoutes", "Time to Ready (s)", "Error"},
-		func(dir string) error {
-			return generateScaleHTTPRouteManifests(*numIterations, dir)
-		},
-		url,
-	)
-}
-
-func runScaleTest(
-	t *testing.T,
-	resultHeaders []string,
-	generateManifests func(dir string) error,
-	url string,
-) {
-	t.Helper()
-	manifestDir := t.Name()
-
-	writer := newResultsWriter(t, t.Name(), resultHeaders...)
- - if err := generateManifests(manifestDir); err != nil { - t.Fatalf("failed to generate manifests: %s", err) - } - - startTime := time.Now() - startUnix := fmt.Sprintf("%d", startTime.Unix()) - - if err := kubectlApply(getPrereqDirName(manifestDir)); err != nil { - t.Fatalf("failed to apply prerequisite resources: %s", err) - } - - t.Log("Waiting for all Pods to be Ready") - if err := kubectlWaitAllPodsReady(); err != nil { - t.Fatalf("failed to wait for all Pods to be Ready: %s", err) - } - - for i := 0; i < *numIterations; i++ { - t.Logf("Scaling up to %d resources", i) - - manifestFile := filepath.Join(manifestDir, fmt.Sprintf("manifest-%d.yaml", i)) - - if err := kubectlApply(manifestFile); err != nil { - t.Errorf("failed to scale up: %s", err) - } - - host := fmt.Sprintf("%d.example.com", i) - - t.Logf("Sending request to url %s with host %s...", url, host) - - ttr, err := waitForResponseForHost(url, host) - - seconds := ttr.Seconds() - record := []string{strconv.Itoa(i + 1), strconv.FormatFloat(seconds, 'f', -1, 64)} - if err != nil { - record = append(record, err.Error()) - } - - if err = writer.Write(record); err != nil { - t.Fatalf("failed to write time to ready to csv file: %s", err) - } - - time.Sleep(*delay) - } - - endTime := time.Now() - endUnix := fmt.Sprintf("%d", endTime.Unix()) - - // This accounts for prometheus 10s scraping window - endUnixPlusTen := fmt.Sprintf("%d", endTime.Add(10*time.Second).Unix()) - - records := [][]string{ - {"Test Start", "Test End", "Test End + 10s", "Duration"}, - {startUnix, endUnix, endUnixPlusTen, endTime.Sub(startTime).String()}, - } - - if err := writer.WriteAll(records); err != nil { - t.Logf("failed to write records to csv") - } -} - -func getIP(t *testing.T) string { - t.Helper() - - ip := os.Getenv("NGF_IP") - if ip == "" { - t.Fatalf("NGF_IP env var not set") - } - - return ip -} - -func newResultsWriter(t *testing.T, testName string, resultHeaders ...string) *csv.Writer { - t.Helper() - - versionDir := filepath.Join("results", *version) - if err := os.Mkdir(versionDir, 0o750); err != nil && !os.IsExist(err) { - t.Fatalf("failed to create results version directory: %s", err) - } - - testDirName := testName - if *plus { - testDirName += "_Plus" - } - - dir := filepath.Join(versionDir, testDirName) - if err := os.Mkdir(dir, 0o750); err != nil { - t.Fatalf("failed to create results test directory: %s", err) - } - - file, err := os.Create(filepath.Join(dir, "results.csv")) - if err != nil { - t.Fatalf("failed to create results csv file: %s", err) - } - - writer := csv.NewWriter(file) - - if err = writer.Write(resultHeaders); err != nil { - t.Fatalf("failed to write headers to csv file: %s", err) - } - - t.Cleanup(func() { - writer.Flush() - _ = file.Close() - }) - - return writer -} - -func kubectlApply(filename string) error { - if err := kubectlExec("apply", "-f", filename); err != nil { - return fmt.Errorf("error applying %s: %w", filename, err) - } - - return nil -} - -func kubectlWaitAllPodsReady() error { - if err := kubectlExec("wait", "pod", "--all", "--for=condition=Ready"); err != nil { - return fmt.Errorf("error waiting for all pods to be ready:%w", err) - } - - return nil -} - -func kubectlExec(arg ...string) error { - cmd := exec.Command("kubectl", arg...) 
- - return cmd.Run() -} - -func waitForResponseForHost(url, host string) (time.Duration, error) { - client := &http.Client{} - - if strings.HasPrefix(url, "https") { - customTransport := http.DefaultTransport.(*http.Transport) - customTransport.TLSClientConfig = &tls.Config{ - InsecureSkipVerify: true, // nolint: gosec - ServerName: host, - } - client.Transport = customTransport - } - - ctx, cancel := context.WithTimeout(context.Background(), time.Minute) - defer cancel() - - req, err := http.NewRequestWithContext(ctx, "GET", url, nil) - if err != nil { - return 0, err - } - - req.Host = host - - start := time.Now() - - err = wait.PollUntilContextCancel( - ctx, - 200*time.Millisecond, - true, - func(ctx context.Context) (done bool, err error) { - resp, err := client.Do(req) - if err != nil { - fmt.Println("Retrying GET request", "error", err) - return false, err - } - - if resp.StatusCode == http.StatusOK { - return true, nil - } - - fmt.Println("Retrying GET request", "host", host, "status", resp.Status) - return false, nil - }) - - return time.Since(start), err -} diff --git a/tests/scripts/cpu-plot.gp b/tests/scripts/cpu-plot.gp new file mode 100644 index 0000000000..71584811a3 --- /dev/null +++ b/tests/scripts/cpu-plot.gp @@ -0,0 +1,20 @@ +set terminal png size 800,600 +set title "CPU Usage" +set datafile separator "," +set output outputfile . "" + +# X-axis settings +set xlabel "Timestamp" +set xdata time +set timefmt "%s" +set format x "%M:%S" +set xrange [*:*] +set xtics nomirror + +# Y-axis settings +set yrange [0:*] +set ylabel "CPU Usage (core seconds)" +set format y "%.2f" + +# Plotting data +plot inputfile using 1:2 with lines lw 2 notitle diff --git a/tests/scripts/create-gke-cluster.sh b/tests/scripts/create-gke-cluster.sh index 9d034e1c62..b5a2061f8b 100644 --- a/tests/scripts/create-gke-cluster.sh +++ b/tests/scripts/create-gke-cluster.sh @@ -6,6 +6,16 @@ ip_random_digit=$((1 + $RANDOM % 250)) IS_CI=${1:-false} +if [ -z "$GKE_MACHINE_TYPE" ]; then + # If the environment variable is not set, use a default value + GKE_MACHINE_TYPE="e2-medium" +fi + +if [ -z "$GKE_NUM_NODES" ]; then + # If the environment variable is not set, use a default value + GKE_NUM_NODES="3" +fi + gcloud container clusters create ${GKE_CLUSTER_NAME} \ --project ${GKE_PROJECT} \ --zone ${GKE_CLUSTER_ZONE} \ @@ -16,7 +26,9 @@ gcloud container clusters create ${GKE_CLUSTER_NAME} \ --master-ipv4-cidr 172.16.${ip_random_digit}.32/28 \ --metadata=block-project-ssh-keys=TRUE \ --monitoring=SYSTEM,POD,DEPLOYMENT \ - --logging=SYSTEM,WORKLOAD + --logging=SYSTEM,WORKLOAD \ + --machine-type ${GKE_MACHINE_TYPE} \ + --num-nodes ${GKE_NUM_NODES} # Add current IP to GKE master control node access, if this script is not invoked during a CI run. if [ "${IS_CI}" = "false" ]; then diff --git a/tests/scripts/memory-plot.gp b/tests/scripts/memory-plot.gp new file mode 100644 index 0000000000..c25614db70 --- /dev/null +++ b/tests/scripts/memory-plot.gp @@ -0,0 +1,21 @@ +# Define a function to convert bytes to Mebibytes +bytes_to_MiB(bytes) = bytes / (1024.0 * 1024.0) + +set terminal png size 800,600 +set title "Memory Usage" +set datafile separator "," +set output outputfile . 
"" + +# X-axis settings +set xlabel "Timestamp" +set xdata time +set timefmt "%s" +set format x "%M:%S" +set xrange [*:*] # Specify a range covering all timestamps + +# Y-axis settings +set yrange [0:*] +set ylabel "Memory Usage (MiB)" + +# Plotting data +plot inputfile using 1:(bytes_to_MiB($2)) with lines lw 2 notitle diff --git a/tests/scripts/ttr-plot.gp b/tests/scripts/ttr-plot.gp new file mode 100644 index 0000000000..97d0eb2891 --- /dev/null +++ b/tests/scripts/ttr-plot.gp @@ -0,0 +1,18 @@ +set terminal png size 800,600 +set title "Scaling resources" +set datafile separator "," +set output outputfile . "" + +# X-axis settings +set xrange [0:70] +set xtics 10 +set xlabel "# Resources" +set grid xtics + +# Y-axis settings +set yrange [0:*] +set ylabel "Time to Ready (s)" +set format y "%.1f" + +# Plotting data +plot inputfile using 1:2 with lines lw 2 notitle diff --git a/tests/scripts/vars.env-example b/tests/scripts/vars.env-example index c4163b0120..52138f113f 100644 --- a/tests/scripts/vars.env-example +++ b/tests/scripts/vars.env-example @@ -20,3 +20,5 @@ SOURCE_IP_RANGE= PLUS_ENABLED= NGF_VERSION= +GKE_MACHINE_TYPE= +GKE_NUM_NODES= diff --git a/tests/suite/dataplane_perf_test.go b/tests/suite/dataplane_perf_test.go index 743e79977c..3860133e79 100644 --- a/tests/suite/dataplane_perf_test.go +++ b/tests/suite/dataplane_perf_test.go @@ -77,7 +77,7 @@ var _ = Describe("Dataplane performance", Ordered, Label("nfr", "performance"), AfterAll(func() { Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.DeleteNamespace(ns.Name)).To(Succeed()) outFile.Close() }) @@ -97,7 +97,7 @@ var _ = Describe("Dataplane performance", Ordered, Label("nfr", "performance"), } _, metrics := framework.RunLoadTest(cfg) - Expect(framework.WriteResults(outFile, &metrics)).To(Succeed()) + Expect(framework.WriteMetricsResults(outFile, &metrics)).To(Succeed()) _, err = fmt.Fprint(outFile, "```\n") Expect(err).ToNot(HaveOccurred()) diff --git a/tests/suite/graceful_recovery_test.go b/tests/suite/graceful_recovery_test.go index bf6c61295b..b4148039e4 100644 --- a/tests/suite/graceful_recovery_test.go +++ b/tests/suite/graceful_recovery_test.go @@ -78,7 +78,7 @@ var _ = Describe("Graceful Recovery test", Ordered, Label("nfr", "graceful-recov AfterAll(func() { Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.DeleteNamespace(ns.Name)).To(Succeed()) }) It("recovers when NGF container is restarted", func() { diff --git a/tests/suite/longevity_test.go b/tests/suite/longevity_test.go index ec60aa431e..2ddf66c442 100644 --- a/tests/suite/longevity_test.go +++ b/tests/suite/longevity_test.go @@ -81,7 +81,7 @@ var _ = Describe("Longevity", Label("longevity-setup", "longevity-teardown"), fu Expect(writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS")).To(Succeed()) Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.DeleteNamespace(ns.Name)).To(Succeed()) }) }) diff --git a/tests/scale/manifests/scale-matches.yaml b/tests/suite/manifests/scale/matches.yaml similarity index 100% rename from tests/scale/manifests/scale-matches.yaml rename to tests/suite/manifests/scale/matches.yaml diff --git a/tests/scale/manifests/scale-upstreams.yaml b/tests/suite/manifests/scale/upstreams.yaml 
similarity index 100% rename from tests/scale/manifests/scale-upstreams.yaml rename to tests/suite/manifests/scale/upstreams.yaml diff --git a/tests/suite/sample_test.go b/tests/suite/sample_test.go index 3996c67646..56d08713a6 100644 --- a/tests/suite/sample_test.go +++ b/tests/suite/sample_test.go @@ -34,7 +34,7 @@ var _ = Describe("Basic test example", Label("functional"), func() { AfterEach(func() { Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.DeleteNamespace(ns.Name)).To(Succeed()) }) It("sends traffic", func() { diff --git a/tests/suite/scale_test.go b/tests/suite/scale_test.go new file mode 100644 index 0000000000..8d7c187db7 --- /dev/null +++ b/tests/suite/scale_test.go @@ -0,0 +1,797 @@ +package suite + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "text/template" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + core "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + ctlr "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/nginxinc/nginx-gateway-fabric/tests/framework" +) + +var _ = Describe("Scale test", Ordered, Label("nfr", "scale"), func() { + // One of the tests - scales upstream servers - requires a big cluster to provision 648 pods. + // On GKE, you can use the following configuration: + // - A Kubernetes cluster with 12 nodes on GKE + // - Node: n2d-standard-16 (16 vCPU, 64GB memory) + + var ( + matchesManifests = []string{ + "scale/matches.yaml", + } + upstreamsManifests = []string{ + "scale/upstreams.yaml", + } + + namespace = "scale" + + scrapeInterval = 15 * time.Second + queryRangeStep = 15 * time.Second + + resultsDir string + outFile *os.File + ngfPodName string + promInstance framework.PrometheusInstance + promPortForwardStopCh = make(chan struct{}) + ) + + const ( + httpListenerCount = 64 + httpsListenerCount = 64 + httpRouteCount = 1000 + upstreamServerCount = 648 + ) + + BeforeAll(func() { + // Scale tests need a dedicated NGF instance + // Because they analyze the logs of NGF and NGINX, and they don't want to analyze the logs of other tests. + teardown(releaseName) + + var err error + resultsDir, err = framework.CreateResultsDir("scale", version) + Expect(err).ToNot(HaveOccurred()) + + filename := filepath.Join(resultsDir, framework.CreateResultsFilename("md", version, *plusEnabled)) + outFile, err = framework.CreateResultsFile(filename) + Expect(err).ToNot(HaveOccurred()) + Expect(framework.WriteSystemInfoToFile(outFile, clusterInfo, *plusEnabled)).To(Succeed()) + + promCfg := framework.PrometheusConfig{ + ScrapeInterval: scrapeInterval, + } + + promInstance, err = framework.InstallPrometheus(resourceManager, promCfg) + Expect(err).ToNot(HaveOccurred()) + + k8sConfig := ctlr.GetConfigOrDie() + + if !clusterInfo.IsGKE { + Expect(promInstance.PortForward(k8sConfig, promPortForwardStopCh)).To(Succeed()) + } + }) + + BeforeEach(func() { + // Scale tests need a dedicated NGF instance per test. + // Because they analyze the logs of NGF and NGINX, and they don't want to analyze the logs of other tests. 
+ cfg := getDefaultSetupCfg() + cfg.nfr = true + setup(cfg) + + ns := &core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespace, + }, + } + Expect(resourceManager.Apply([]client.Object{ns})).To(Succeed()) + + podNames, err := framework.GetReadyNGFPodNames(k8sClient, ngfNamespace, releaseName, timeoutConfig.GetTimeout) + Expect(err).ToNot(HaveOccurred()) + Expect(podNames).To(HaveLen(1)) + ngfPodName = podNames[0] + }) + + createResponseChecker := func(url, address string) func() error { + return func() error { + status, _, err := framework.Get(url, address, timeoutConfig.RequestTimeout) + if err != nil { + return fmt.Errorf("bad response: %w", err) + } + + if status != 200 { + return fmt.Errorf("unexpected status code: %d", status) + } + + return nil + } + } + + createMetricExistChecker := func(query string, getTime func() time.Time, modifyTime func()) func() error { + return func() error { + queryWithTimestamp := fmt.Sprintf("%s @ %d", query, getTime().Unix()) + + result, err := promInstance.Query(queryWithTimestamp) + if err != nil { + return fmt.Errorf("failed to query Prometheus: %w", err) + } + + if result.String() == "" { + modifyTime() + return errors.New("empty result") + } + + return nil + } + } + + createEndTimeFinder := func(query string, startTime time.Time, t *time.Time) func() error { + return func() error { + result, err := promInstance.QueryRange(query, v1.Range{ + Start: startTime, + End: *t, + Step: queryRangeStep, + }) + if err != nil { + return fmt.Errorf("failed to query Prometheus: %w", err) + } + + if result.String() == "" { + *t = time.Now() + return errors.New("empty result") + } + + return nil + } + } + + getFirstValueOfVector := func(query string) float64 { + result, err := promInstance.Query(query) + Expect(err).ToNot(HaveOccurred()) + + val, err := framework.GetFirstValueOfPrometheusVector(result) + Expect(err).ToNot(HaveOccurred()) + + return val + } + + getBuckets := func(query string) []bucket { + result, err := promInstance.Query(query) + Expect(err).ToNot(HaveOccurred()) + + res, ok := result.(model.Vector) + Expect(ok).To(BeTrue()) + + buckets := make([]bucket, 0, len(res)) + + for _, sample := range res { + le := sample.Metric["le"] + val := float64(sample.Value) + bucket := bucket{ + Le: string(le), + Val: int(val), + } + buckets = append(buckets, bucket) + } + + return buckets + } + + checkLogErrors := func( + containerName string, + substrings []string, + ignoredSubstrings []string, + fileName string, + ) int { + logs, err := resourceManager.GetPodLogs(ngfNamespace, ngfPodName, &core.PodLogOptions{ + Container: containerName, + }) + Expect(err).ToNot(HaveOccurred()) + + logLines := strings.Split(logs, "\n") + errors := 0 + + outer: + for _, line := range logLines { + for _, substr := range ignoredSubstrings { + if strings.Contains(line, substr) { + continue outer + } + } + for _, substr := range substrings { + if strings.Contains(line, substr) { + errors++ + continue outer + } + } + } + + // attach full logs + if errors > 0 { + f, err := os.Create(fileName) + Expect(err).ToNot(HaveOccurred()) + defer f.Close() + + _, err = io.WriteString(f, logs) + Expect(err).ToNot(HaveOccurred()) + } + return errors + } + + runTestWithMetricsAndLogs := func(testName, testResultsDir string, test func()) { + var ( + metricExistTimeout = 2 * time.Minute + metricExistPolling = 1 * time.Second + ) + + startTime := time.Now() + + // We need to make sure that for the startTime, the metrics exists in Prometheus. 
+	// if they don't exist, we increase the startTime and try again.
+	// Note: it's important that the polling interval in Eventually is greater than the startTime increment.
+
+	getStartTime := func() time.Time { return startTime }
+	modifyStartTime := func() { startTime = startTime.Add(500 * time.Millisecond) }
+
+	queries := []string{
+		fmt.Sprintf(`container_memory_usage_bytes{pod="%s",container="nginx-gateway"}`, ngfPodName),
+		fmt.Sprintf(`container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}`, ngfPodName),
+		// We don't need to check all nginx_gateway_fabric_* metrics, as they are collected at the same time
+		fmt.Sprintf(`nginx_gateway_fabric_nginx_reloads_total{pod="%s"}`, ngfPodName),
+	}
+
+	for _, q := range queries {
+		Eventually(
+			createMetricExistChecker(
+				q,
+				getStartTime,
+				modifyStartTime,
+			),
+		).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed())
+	}
+
+	test()
+
+	// We sleep for 2 scrape intervals to ensure that Prometheus scrapes the metrics after the test() finishes
+	// before endTime, so that we don't lose any metric values like reloads.
+	time.Sleep(2 * scrapeInterval)
+
+	endTime := time.Now()
+
+	// Now we check that Prometheus has the metrics for the endTime
+
+	// If the test duration is small (which can happen if you run the test with a small number of resources),
+	// the rate query may not return any data.
+	// To ensure it returns data, we increase the endTime.
+	Eventually(
+		createEndTimeFinder(
+			fmt.Sprintf(`rate(container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}[2m])`, ngfPodName),
+			startTime,
+			&endTime,
+		),
+	).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed())
+
+	getEndTime := func() time.Time { return endTime }
+	noOpModifier := func() {}
+
+	queries = []string{
+		fmt.Sprintf(`container_memory_usage_bytes{pod="%s",container="nginx-gateway"}`, ngfPodName),
+		// We don't need to check all nginx_gateway_fabric_* metrics, as they are collected at the same time
+		fmt.Sprintf(`nginx_gateway_fabric_nginx_reloads_total{pod="%s"}`, ngfPodName),
+	}
+
+	for _, q := range queries {
+		Eventually(
+			createMetricExistChecker(
+				q,
+				getEndTime,
+				noOpModifier,
+			),
+		).WithTimeout(metricExistTimeout).WithPolling(metricExistPolling).Should(Succeed())
+	}
+
+	// Collect metric values
+	// For some metrics, generate PNGs
+
+	result, err := promInstance.QueryRange(
+		fmt.Sprintf(`container_memory_usage_bytes{pod="%s",container="nginx-gateway"}`, ngfPodName),
+		v1.Range{
+			Start: startTime,
+			End:   endTime,
+			Step:  queryRangeStep,
+		},
+	)
+	Expect(err).ToNot(HaveOccurred())
+
+	memCSV := filepath.Join(testResultsDir, framework.CreateResultsFilename("csv", "memory", *plusEnabled))
+	Expect(framework.WritePrometheusMatrixToCSVFile(memCSV, result)).To(Succeed())
+
+	memPNG := framework.CreateResultsFilename("png", "memory", *plusEnabled)
+	Expect(
+		framework.GenerateMemoryPNG(testResultsDir, memCSV, memPNG),
+	).To(Succeed())
+
+	Expect(os.Remove(memCSV)).To(Succeed())
+
+	result, err = promInstance.QueryRange(
+		fmt.Sprintf(`rate(container_cpu_usage_seconds_total{pod="%s",container="nginx-gateway"}[2m])`, ngfPodName),
+		v1.Range{
+			Start: startTime,
+			End:   endTime,
+			Step:  queryRangeStep,
+		},
+	)
+	Expect(err).ToNot(HaveOccurred())
+
+	cpuCSV := filepath.Join(testResultsDir, framework.CreateResultsFilename("csv", "cpu", *plusEnabled))
+	Expect(framework.WritePrometheusMatrixToCSVFile(cpuCSV, result)).To(Succeed())
+
+	cpuPNG := framework.CreateResultsFilename("png",
"cpu", *plusEnabled) + Expect( + framework.GenerateCPUPNG(testResultsDir, cpuCSV, cpuPNG), + ).To(Succeed()) + + Expect(os.Remove(cpuCSV)).To(Succeed()) + + reloadCount := getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + ) + + reloadErrsCount := getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reload_errors_total{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + ) + + reloadAvgTime := getFirstValueOfVector( + fmt.Sprintf( + `(nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ + ` / `+ + `(nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_total{pod="%[1]s"} @ %[2]d)`, + ngfPodName, + startTime.Unix(), + )) + + reloadBuckets := getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_nginx_reloads_milliseconds_bucket{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + ) + + eventsCount := getFirstValueOfVector( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + ) + + eventsAvgTime := getFirstValueOfVector( + fmt.Sprintf( + `(nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_sum{pod="%[1]s"} @ %[2]d)`+ + ` / `+ + `(nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_count{pod="%[1]s"} @ %[2]d)`, + ngfPodName, + startTime.Unix(), + ), + ) + + eventsBuckets := getBuckets( + fmt.Sprintf( + `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"}`+ + ` - `+ + `nginx_gateway_fabric_event_batch_processing_milliseconds_bucket{pod="%[1]s"} @ %d`, + ngfPodName, + startTime.Unix(), + ), + ) + + // Check container logs for errors + + ngfErrors := checkLogErrors( + "nginx-gateway", + []string{"error"}, + []string{`"logger":"usageReporter`}, // ignore usageReporter errors + filepath.Join(testResultsDir, framework.CreateResultsFilename("log", "ngf", *plusEnabled)), + ) + nginxErrors := checkLogErrors( + "nginx", + []string{"[error]", "[emerg]", "[crit]", "[alert]"}, + nil, + filepath.Join(testResultsDir, framework.CreateResultsFilename("log", "nginx", *plusEnabled)), + ) + + // Check container restarts + + pod, err := resourceManager.GetPod(ngfNamespace, ngfPodName) + Expect(err).ToNot(HaveOccurred()) + + findRestarts := func(name string) int { + for _, containerStatus := range pod.Status.ContainerStatuses { + if containerStatus.Name == name { + return int(containerStatus.RestartCount) + } + } + Fail(fmt.Sprintf("container %s not found", name)) + return 0 + } + + ngfRestarts := findRestarts("nginx-gateway") + nginxRestarts := findRestarts("nginx") + + // Write results + + results := scaleTestResults{ + Name: testName, + ReloadCount: int(reloadCount), + ReloadErrsCount: int(reloadErrsCount), + ReloadAvgTime: int(reloadAvgTime), + ReloadBuckets: reloadBuckets, + EventsCount: int(eventsCount), + EventsAvgTime: 
int(eventsAvgTime), + EventsBuckets: eventsBuckets, + NGFErrors: ngfErrors, + NginxErrors: nginxErrors, + NGFContainerRestarts: ngfRestarts, + NginxContainerRestarts: nginxRestarts, + } + + err = writeScaleResults(outFile, results) + Expect(err).ToNot(HaveOccurred()) + } + + runScaleResources := func(objects framework.ScaleObjects, testResultsDir string, protocol string) { + ttrCsvFileName := framework.CreateResultsFilename("csv", "ttr", *plusEnabled) + ttrCsvFile, writer, err := framework.NewCSVResultsWriter(testResultsDir, ttrCsvFileName) + Expect(err).ToNot(HaveOccurred()) + defer ttrCsvFile.Close() + + Expect(resourceManager.Apply(objects.BaseObjects)).To(Succeed()) + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + + Expect(resourceManager.WaitForPodsToBeReady(ctx, namespace)).To(Succeed()) + + for i := 0; i < len(objects.ScaleIterationGroups); i++ { + Expect(resourceManager.Apply(objects.ScaleIterationGroups[i])).To(Succeed()) + + var url string + if protocol == "http" && portFwdPort != 0 { + url = fmt.Sprintf("%s://%d.example.com:%d", protocol, i, portFwdPort) + } else if protocol == "https" && portFwdHTTPSPort != 0 { + url = fmt.Sprintf("%s://%d.example.com:%d", protocol, i, portFwdHTTPSPort) + } else { + url = fmt.Sprintf("%s://%d.example.com", protocol, i) + } + + startCheck := time.Now() + + Eventually( + createResponseChecker(url, address), + ).WithTimeout(30 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) + + ttr := time.Since(startCheck) + + seconds := ttr.Seconds() + record := []string{strconv.Itoa(i + 1), strconv.FormatFloat(seconds, 'f', -1, 64)} + + Expect(writer.Write(record)).To(Succeed()) + } + + writer.Flush() + Expect(ttrCsvFile.Close()).To(Succeed()) + + ttrPNG := framework.CreateResultsFilename("png", "ttr", *plusEnabled) + Expect( + framework.GenerateTTRPNG(testResultsDir, ttrCsvFile.Name(), ttrPNG), + ).To(Succeed()) + + Expect(os.Remove(ttrCsvFile.Name())).To(Succeed()) + } + + runScaleUpstreams := func() { + Expect(resourceManager.ApplyFromFiles(upstreamsManifests, namespace)).To(Succeed()) + Expect(resourceManager.WaitForAppsToBeReady(namespace)).To(Succeed()) + + var url string + if portFwdPort != 0 { + url = fmt.Sprintf("http://hello.example.com:%d", portFwdPort) + } else { + url = "http://hello.example.com" + } + + Eventually( + createResponseChecker(url, address), + ).WithTimeout(5 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) + + Expect( + resourceManager.ScaleDeployment(namespace, "backend", upstreamServerCount), + ).To(Succeed()) + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute) + defer cancel() + + Expect(resourceManager.WaitForPodsToBeReady(ctx, namespace)).To(Succeed()) + + Eventually( + createResponseChecker(url, address), + ).WithTimeout(5 * time.Second).WithPolling(100 * time.Millisecond).Should(Succeed()) + } + + setNamespace := func(objects framework.ScaleObjects) { + for _, obj := range objects.BaseObjects { + obj.SetNamespace(namespace) + } + for _, objs := range objects.ScaleIterationGroups { + for _, obj := range objs { + obj.SetNamespace(namespace) + } + } + } + + It(fmt.Sprintf("scales HTTP listeners to %d", httpListenerCount), func() { + const testName = "TestScale_Listeners" + + testResultsDir := filepath.Join(resultsDir, testName) + Expect(os.MkdirAll(testResultsDir, 0755)).To(Succeed()) + + objects, err := framework.GenerateScaleListenerObjects(httpListenerCount, false /*non-tls*/) + Expect(err).ToNot(HaveOccurred()) + + 
setNamespace(objects)
+
+		runTestWithMetricsAndLogs(
+			testName,
+			testResultsDir,
+			func() {
+				runScaleResources(
+					objects,
+					testResultsDir,
+					"http",
+				)
+			},
+		)
+	})
+
+	It(fmt.Sprintf("scales HTTPS listeners to %d", httpsListenerCount), func() {
+		const testName = "TestScale_HTTPSListeners"
+
+		testResultsDir := filepath.Join(resultsDir, testName)
+		Expect(os.MkdirAll(testResultsDir, 0o755)).To(Succeed())
+
+		objects, err := framework.GenerateScaleListenerObjects(httpsListenerCount, true /*tls*/)
+		Expect(err).ToNot(HaveOccurred())
+
+		setNamespace(objects)
+
+		runTestWithMetricsAndLogs(
+			testName,
+			testResultsDir,
+			func() {
+				runScaleResources(
+					objects,
+					testResultsDir,
+					"https",
+				)
+			},
+		)
+	})
+
+	It(fmt.Sprintf("scales HTTP routes to %d", httpRouteCount), func() {
+		const testName = "TestScale_HTTPRoutes"
+
+		testResultsDir := filepath.Join(resultsDir, testName)
+		Expect(os.MkdirAll(testResultsDir, 0o755)).To(Succeed())
+
+		objects, err := framework.GenerateScaleHTTPRouteObjects(httpRouteCount)
+		Expect(err).ToNot(HaveOccurred())
+
+		setNamespace(objects)
+
+		runTestWithMetricsAndLogs(
+			testName,
+			testResultsDir,
+			func() {
+				runScaleResources(
+					objects,
+					testResultsDir,
+					"http",
+				)
+			},
+		)
+	})
+
+	It(fmt.Sprintf("scales upstream servers to %d", upstreamServerCount), func() {
+		const testName = "TestScale_UpstreamServers"
+
+		testResultsDir := filepath.Join(resultsDir, testName)
+		Expect(os.MkdirAll(testResultsDir, 0o755)).To(Succeed())
+
+		runTestWithMetricsAndLogs(
+			testName,
+			testResultsDir,
+			func() {
+				runScaleUpstreams()
+			},
+		)
+	})
+
+	It("scales HTTP matches", func() {
+		const testName = "TestScale_HTTPMatches"
+
+		Expect(resourceManager.ApplyFromFiles(matchesManifests, namespace)).To(Succeed())
+		Expect(resourceManager.WaitForAppsToBeReady(namespace)).To(Succeed())
+
+		var port int
+		if portFwdPort != 0 {
+			port = portFwdPort
+		} else {
+			port = 80
+		}
+
+		addr := fmt.Sprintf("%s:%d", address, port)
+
+		baseURL := "http://cafe.example.com"
+
+		text := fmt.Sprintf("\n## Test %s\n\n", testName)
+
+		_, err := fmt.Fprint(outFile, text)
+		Expect(err).ToNot(HaveOccurred())
+
+		// The description distinguishes the first-match run from the last-match run in the results file.
+		run := func(t framework.Target, description string) {
+			cfg := framework.LoadTestConfig{
+				Targets:     []framework.Target{t},
+				Rate:        1000,
+				Duration:    30 * time.Second,
+				Description: description,
+				Proxy:       addr,
+				ServerName:  "cafe.example.com",
+			}
+			_, metrics := framework.RunLoadTest(cfg)
+
+			_, err = fmt.Fprintln(outFile, "```text")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(framework.WriteMetricsResults(outFile, &metrics)).To(Succeed())
+			_, err = fmt.Fprintln(outFile, "```")
+			Expect(err).ToNot(HaveOccurred())
+		}
+
+		run(framework.Target{
+			Method: "GET",
+			URL:    fmt.Sprintf("%s%s", baseURL, "/latte"),
+			Header: map[string][]string{
+				"header-1": {"header-1-val"},
+			},
+		}, "First match")
+
+		run(framework.Target{
+			Method: "GET",
+			URL:    fmt.Sprintf("%s%s", baseURL, "/latte"),
+			Header: map[string][]string{
+				"header-50": {"header-50-val"},
+			},
+		}, "Last match")
+	})
+
+	AfterEach(func() {
+		teardown(releaseName)
+		Expect(resourceManager.DeleteNamespace(namespace)).To(Succeed())
+	})
+
+	AfterAll(func() {
+		close(promPortForwardStopCh)
+		Expect(framework.UninstallPrometheus(resourceManager)).To(Succeed())
+		Expect(outFile.Close()).To(Succeed())
+
+		// restore the NGF instance shared among the other tests in the suite
+		cfg := getDefaultSetupCfg()
+		cfg.nfr = true
+		setup(cfg)
+	})
+})
+
+type bucket struct {
+	Le  string
+	Val int
+}
+
+type scaleTestResults struct {
+	Name string
+
+	ReloadCount     int
+	ReloadErrsCount int
+	ReloadAvgTime
int + ReloadBuckets []bucket + + EventsCount int + EventsAvgTime int + EventsBuckets []bucket + + NGFErrors int + NginxErrors int + + NGFContainerRestarts int + NginxContainerRestarts int +} + +const scaleResultTemplate = ` +## Test {{ .Name }} + +### Reloads + +- Total: {{ .ReloadCount }} +- Total Errors: {{ .ReloadErrsCount }} +- Average Time: {{ .ReloadAvgTime }}ms +- Reload distribution: +{{- range .ReloadBuckets }} + - {{ .Le }}ms: {{ .Val }} +{{- end }} + +### Event Batch Processing + +- Total: {{ .EventsCount }} +- Average Time: {{ .EventsAvgTime }}ms +- Event Batch Processing distribution: +{{- range .EventsBuckets }} + - {{ .Le }}ms: {{ .Val }} +{{- end }} + +### Errors + +- NGF errors: {{ .NGFErrors }} +- NGF container restarts: {{ .NGFContainerRestarts }} +- NGINX errors: {{ .NginxErrors }} +- NGINX container restarts: {{ .NginxContainerRestarts }} + +### Graphs and Logs + +See [output directory](./{{ .Name }}) for more details. +The logs are attached only if there are errors. +` + +func writeScaleResults(dest io.Writer, results scaleTestResults) error { + tmpl, err := template.New("results").Parse(scaleResultTemplate) + if err != nil { + return err + } + + return tmpl.Execute(dest, results) +} diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index 7c1218d15f..0dd0b3f4bb 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -4,6 +4,7 @@ import ( "context" "embed" "flag" + "fmt" "path" "path/filepath" "runtime" @@ -64,8 +65,9 @@ var ( manifests embed.FS k8sClient client.Client resourceManager framework.ResourceManager - portForwardStopCh = make(chan struct{}, 1) + portForwardStopCh chan struct{} portFwdPort int + portFwdHTTPSPort int timeoutConfig framework.TimeoutConfig localChartPath string address string @@ -75,8 +77,10 @@ var ( ) const ( - releaseName = "ngf-test" - ngfNamespace = "nginx-gateway" + releaseName = "ngf-test" + ngfNamespace = "nginx-gateway" + ngfHTTPForwardedPort = 10080 + ngfHTTPSForwardedPort = 10443 ) type setupConfig struct { @@ -179,8 +183,12 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { Expect(podNames).ToNot(BeEmpty()) if *serviceType != "LoadBalancer" { - portFwdPort, err = framework.PortForward(k8sConfig, installCfg.Namespace, podNames[0], portForwardStopCh) + ports := []string{fmt.Sprintf("%d:80", ngfHTTPForwardedPort), fmt.Sprintf("%d:443", ngfHTTPSForwardedPort)} + portForwardStopCh = make(chan struct{}) + err = framework.PortForward(k8sConfig, installCfg.Namespace, podNames[0], ports, portForwardStopCh) address = "127.0.0.1" + portFwdPort = ngfHTTPForwardedPort + portFwdHTTPSPort = ngfHTTPSForwardedPort } else { address, err = resourceManager.GetLBIPAddress(installCfg.Namespace) } @@ -189,7 +197,9 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { func teardown(relName string) { if portFwdPort != 0 { - portForwardStopCh <- struct{}{} + close(portForwardStopCh) + portFwdPort = 0 + portFwdHTTPSPort = 0 } cfg := framework.InstallationConfig{ @@ -242,15 +252,18 @@ var _ = BeforeSuite(func() { cfg.nfr = isNFR(labelFilter) // Skip deployment if: - // - running upgrade test (this test will deploy its own version) - // - running longevity teardown (deployment will already exist) - // - running telemetry test (NGF will be deployed as part of the test) - if strings.Contains(labelFilter, "upgrade") || - strings.Contains(labelFilter, "longevity-teardown") || - strings.Contains(labelFilter, "telemetry") || - strings.Contains(labelFilter, "graceful-recovery") { - - 
cfg.deploy = false + skipSubstrings := []string{ + "upgrade", // - running upgrade test (this test will deploy its own version) + "longevity-teardown", // - running longevity teardown (deployment will already exist) + "telemetry", // - running telemetry test (NGF will be deployed as part of the test) + "graceful-recovery", // - running graceful recovery test (this test will deploy its own version) + "scale", // - running scale test (this test will deploy its own version) + } + for _, s := range skipSubstrings { + if strings.Contains(labelFilter, s) { + cfg.deploy = false + break + } } // use a different release name for longevity to allow us to filter on a specific label when collecting @@ -283,5 +296,6 @@ func isNFR(labelFilter string) bool { strings.Contains(labelFilter, "longevity") || strings.Contains(labelFilter, "performance") || strings.Contains(labelFilter, "upgrade") || - strings.Contains(labelFilter, "graceful-recovery") + strings.Contains(labelFilter, "graceful-recovery") || + strings.Contains(labelFilter, "scale") } diff --git a/tests/suite/upgrade_test.go b/tests/suite/upgrade_test.go index 09b3ecb8cf..e749292078 100644 --- a/tests/suite/upgrade_test.go +++ b/tests/suite/upgrade_test.go @@ -76,7 +76,7 @@ var _ = Describe("Upgrade testing", Label("nfr", "upgrade"), func() { AfterEach(func() { Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) - Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.DeleteNamespace(ns.Name)).To(Succeed()) resultsFile.Close() }) @@ -157,7 +157,7 @@ var _ = Describe("Upgrade testing", Label("nfr", "upgrade"), func() { } buf := new(bytes.Buffer) - encoder := framework.NewCSVEncoder(buf) + encoder := framework.NewVegetaCSVEncoder(buf) for _, res := range results { res := res Expect(encoder.Encode(&res)).To(Succeed()) @@ -173,8 +173,9 @@ var _ = Describe("Upgrade testing", Label("nfr", "upgrade"), func() { csvFile.Close() pngName := framework.CreateResultsFilename("png", scheme, *plusEnabled) - output, err := framework.GeneratePNG(resultsDir, csvName, pngName) - Expect(err).ToNot(HaveOccurred(), string(output)) + Expect( + framework.GenerateRequestsPNG(resultsDir, csvName, pngName), + ).To(Succeed()) metricsCh <- &metricsRes }(test) @@ -251,7 +252,7 @@ var _ = Describe("Upgrade testing", Label("nfr", "upgrade"), func() { _, err := fmt.Fprint(resultsFile, res.testName) Expect(err).ToNot(HaveOccurred()) - Expect(framework.WriteResults(resultsFile, res.metrics)).To(Succeed()) + Expect(framework.WriteMetricsResults(resultsFile, res.metrics)).To(Succeed()) link := fmt.Sprintf("\n\n![%[1]v.png](%[1]v.png)\n", res.scheme) if *plusEnabled {