e2e: fix ZipkinTracing flaky #3899

Merged 9 commits on Jul 19, 2024.
1 change: 1 addition & 0 deletions charts/gateway-addons-helm/README.md
@@ -61,6 +61,7 @@ To uninstall the chart:
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
2 changes: 2 additions & 0 deletions charts/gateway-addons-helm/values.yaml
@@ -60,6 +60,8 @@ prometheus:
# Values for Fluent-bit dependency
fluent-bit:
enabled: true
image:
repository: fluent/fluent-bit # use image from dockerhub
fullnameOverride: fluent-bit
testFramework:
enabled: false
@@ -40,6 +40,7 @@ An Add-ons Helm chart for Envoy Gateway
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
@@ -40,6 +40,7 @@ An Add-ons Helm chart for Envoy Gateway
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
5 changes: 0 additions & 5 deletions test/e2e/base/manifests.yaml
@@ -42,11 +42,6 @@ spec:
allowedRoutes:
namespaces:
from: All
infrastructure:
parametersRef:
group: gateway.envoyproxy.io
kind: EnvoyProxy
name: zipkin-tracing
---
apiVersion: v1
kind: Service
108 changes: 22 additions & 86 deletions test/e2e/tests/tracing.go
@@ -11,20 +11,11 @@ package tests
import (
"context"
"fmt"
"net"
"net/http"
"net/url"
"strings"
"testing"
"time"

"github.com/go-logfmt/logfmt"
"github.com/gogo/protobuf/jsonpb" // nolint: depguard // tempopb use gogo/protobuf
"github.com/grafana/tempo/pkg/tempopb"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/client"
httputils "sigs.k8s.io/gateway-api/conformance/utils/http"
"sigs.k8s.io/gateway-api/conformance/utils/kubernetes"
"sigs.k8s.io/gateway-api/conformance/utils/suite"
@@ -114,94 +105,39 @@ var ZipkinTracingTest = suite.ConformanceTest{
// should keep them consistent
"service.name": fmt.Sprintf("%s/%s", gwNN.Namespace, gwNN.Name),
}
// let's wait for the log to be sent to stdout
if err := wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Minute, true,
func(ctx context.Context) (bool, error) {
count, err := QueryTraceFromTempo(t, suite.Client, tags)
preCount, err := QueryTraceFromTempo(t, suite.Client, tags)
if err != nil {
t.Logf("failed to get trace count from tempo: %v", err)
return false, nil
}

if count > 0 {
return true, nil
httputils.MakeRequestAndExpectEventuallyConsistentResponse(t, suite.RoundTripper, suite.TimeoutConfig, gwAddr, expectedResponse)

// looks like we need almost 15 seconds to get the trace from Tempo?
err = wait.PollUntilContextTimeout(context.TODO(), time.Second, 15*time.Second, true, func(ctx context.Context) (done bool, err error) {
curCount, err := QueryTraceFromTempo(t, suite.Client, tags)
if err != nil {
t.Logf("failed to get current trace count from tempo: %v", err)
return false, nil
}

if curCount > preCount {
return true, nil
}

return false, nil
})
if err != nil {
t.Logf("failed to get current count from tempo: %v", err)
return false, nil
}
return false, nil

return true, nil
}); err != nil {
t.Errorf("failed to get trace from tempo: %v", err)
}
})
},
}

// QueryTraceFromTempo queries span count from tempo
// TODO: move to utils package if needed
func QueryTraceFromTempo(t *testing.T, c client.Client, tags map[string]string) (int, error) {
svc := corev1.Service{}
if err := c.Get(context.Background(), types.NamespacedName{
Namespace: "monitoring",
Name: "tempo",
}, &svc); err != nil {
return -1, err
}
host := ""
for _, ing := range svc.Status.LoadBalancer.Ingress {
if ing.IP != "" {
host = ing.IP
break
}
}

tagsQueryParam, err := createTagsQueryParam(tags)
if err != nil {
return -1, err
}

tempoURL := url.URL{
Scheme: "http",
Host: net.JoinHostPort(host, "3100"),
Path: "/api/search",
}
query := tempoURL.Query()
query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) // query traces from last 10 minutes
query.Add("end", fmt.Sprintf("%d", time.Now().Unix()))
query.Add("tags", tagsQueryParam)
tempoURL.RawQuery = query.Encode()

req, err := http.NewRequest("GET", tempoURL.String(), nil)
if err != nil {
return -1, err
}

t.Logf("send request to %s", tempoURL.String())
res, err := http.DefaultClient.Do(req)
if err != nil {
return -1, err
}

if res.StatusCode != http.StatusOK {
return -1, fmt.Errorf("failed to query tempo, url=%s, status=%s", tempoURL.String(), res.Status)
}

tempoResponse := &tempopb.SearchResponse{}
if err := jsonpb.Unmarshal(res.Body, tempoResponse); err != nil {
return -1, err
}

total := len(tempoResponse.Traces)
t.Logf("get response from tempo, url=%s, response=%v, total=%d", tempoURL.String(), tempoResponse, total)
return total, nil
}

// copy from https://github.com/grafana/tempo/blob/c0127c78c368319433c7c67ca8967adbfed2259e/cmd/tempo-query/tempo/plugin.go#L361
func createTagsQueryParam(tags map[string]string) (string, error) {
tagsBuilder := &strings.Builder{}
tagsEncoder := logfmt.NewEncoder(tagsBuilder)
for k, v := range tags {
err := tagsEncoder.EncodeKeyval(k, v)
if err != nil {
return "", err
}
}
return tagsBuilder.String(), nil
}
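The rewritten test is easier to follow without the diff interleaving: rather than passing as soon as any trace matches the tags, it now records a baseline trace count, sends the request through the gateway, and polls (for up to 15 seconds) until Tempo reports more traces than the baseline. Below is a minimal sketch of that pattern. It reuses QueryTraceFromTempo and the polling values from the diff, while expectNewTrace itself and its sendRequest callback are hypothetical, added only to illustrate the flow.

```go
package tests

import (
	"context"
	"testing"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// expectNewTrace is a hypothetical helper condensing the pattern above:
// record a baseline, send traffic, then poll until the trace count rises.
func expectNewTrace(t *testing.T, c client.Client, tags map[string]string, sendRequest func()) {
	preCount, err := QueryTraceFromTempo(t, c, tags)
	if err != nil {
		t.Fatalf("failed to get baseline trace count from tempo: %v", err)
	}

	// Send the traffic that is expected to produce a new trace, e.g. a
	// request through the gateway made with the conformance HTTP helpers.
	sendRequest()

	if err := wait.PollUntilContextTimeout(context.TODO(), time.Second, 15*time.Second, true,
		func(ctx context.Context) (bool, error) {
			curCount, err := QueryTraceFromTempo(t, c, tags)
			if err != nil {
				// Transient query failures should not fail the test; keep polling.
				t.Logf("failed to get current trace count from tempo: %v", err)
				return false, nil
			}
			return curCount > preCount, nil
		}); err != nil {
		t.Errorf("trace count never rose above the baseline of %d: %v", preCount, err)
	}
}
```

Compared with the old check, which succeeded whenever any matching trace already existed, the baseline comparison only succeeds when this run's request actually produced a new trace.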
75 changes: 75 additions & 0 deletions test/e2e/tests/utils.go
@@ -11,6 +11,7 @@ import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/url"
"strings"
@@ -20,7 +21,10 @@ import (
"fortio.org/fortio/fhttp"
"fortio.org/fortio/periodic"
flog "fortio.org/log"
"github.com/go-logfmt/logfmt"
"github.com/gogo/protobuf/jsonpb" // nolint: depguard // tempopb use gogo/protobuf
"github.com/google/go-cmp/cmp"
"github.com/grafana/tempo/pkg/tempopb"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/stretchr/testify/require"
@@ -494,3 +498,74 @@ type LokiQueryResponse struct {
}
}
}

// QueryTraceFromTempo queries span count from tempo
func QueryTraceFromTempo(t *testing.T, c client.Client, tags map[string]string) (int, error) {
svc := corev1.Service{}
if err := c.Get(context.Background(), types.NamespacedName{
Namespace: "monitoring",
Name: "tempo",
}, &svc); err != nil {
return -1, err
}
host := ""
for _, ing := range svc.Status.LoadBalancer.Ingress {
if ing.IP != "" {
host = ing.IP
break
}
}

tagsQueryParam, err := createTagsQueryParam(tags)
if err != nil {
return -1, err
}

tempoURL := url.URL{
Scheme: "http",
Host: net.JoinHostPort(host, "3100"),
Path: "/api/search",
}
query := tempoURL.Query()
query.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) // query traces from last 10 minutes
query.Add("end", fmt.Sprintf("%d", time.Now().Unix()))
query.Add("tags", tagsQueryParam)
tempoURL.RawQuery = query.Encode()

req, err := http.NewRequest("GET", tempoURL.String(), nil)
if err != nil {
return -1, err
}

t.Logf("send request to %s", tempoURL.String())
res, err := http.DefaultClient.Do(req)
if err != nil {
return -1, err
}

if res.StatusCode != http.StatusOK {
return -1, fmt.Errorf("failed to query tempo, url=%s, status=%s", tempoURL.String(), res.Status)
}

tempoResponse := &tempopb.SearchResponse{}
if err := jsonpb.Unmarshal(res.Body, tempoResponse); err != nil {
return -1, err
}

total := len(tempoResponse.Traces)
t.Logf("get response from tempo, url=%s, response=%v, total=%d", tempoURL.String(), tempoResponse, total)
return total, nil
}

// copy from https://github.com/grafana/tempo/blob/c0127c78c368319433c7c67ca8967adbfed2259e/cmd/tempo-query/tempo/plugin.go#L361
func createTagsQueryParam(tags map[string]string) (string, error) {
tagsBuilder := &strings.Builder{}
tagsEncoder := logfmt.NewEncoder(tagsBuilder)
for k, v := range tags {
err := tagsEncoder.EncodeKeyval(k, v)
if err != nil {
return "", err
}
}
return tagsBuilder.String(), nil
}
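For reference, the request this helper ends up issuing is easier to see outside the test. The sketch below rebuilds the same Tempo search URL standalone: the tags are logfmt-encoded exactly as createTagsQueryParam does, while the host and service name are placeholders standing in for the LoadBalancer IP and Gateway resolved at runtime.

```go
package main

import (
	"fmt"
	"net"
	"net/url"
	"strings"
	"time"

	"github.com/go-logfmt/logfmt"
)

func main() {
	// Placeholder values; the test resolves these from the cluster.
	host := "10.0.0.1" // tempo LoadBalancer IP
	tags := map[string]string{
		"service.name": "envoy-gateway-system/eg", // namespace/name of the Gateway
	}

	// logfmt-encode the tags, mirroring createTagsQueryParam.
	b := &strings.Builder{}
	enc := logfmt.NewEncoder(b)
	for k, v := range tags {
		if err := enc.EncodeKeyval(k, v); err != nil {
			panic(err)
		}
	}

	u := url.URL{
		Scheme: "http",
		Host:   net.JoinHostPort(host, "3100"),
		Path:   "/api/search",
	}
	q := u.Query()
	q.Add("start", fmt.Sprintf("%d", time.Now().Add(-10*time.Minute).Unix())) // last 10 minutes
	q.Add("end", fmt.Sprintf("%d", time.Now().Unix()))
	q.Add("tags", b.String())
	u.RawQuery = q.Encode()

	// Prints something like:
	// http://10.0.0.1:3100/api/search?end=...&start=...&tags=service.name%3Denvoy-gateway-system%2Feg
	fmt.Println(u.String())
}
```

The response body is a tempopb.SearchResponse; the helper simply counts len(tempoResponse.Traces) and leaves per-span details to Tempo.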
2 changes: 1 addition & 1 deletion test/helm/gateway-addons-helm/default.out.yaml
@@ -9059,7 +9059,7 @@ spec:
dnsPolicy: ClusterFirst
containers:
- name: fluent-bit
image: "cr.fluentbit.io/fluent/fluent-bit:2.1.4"
image: "fluent/fluent-bit:2.1.4"
imagePullPolicy: Always
ports:
- name: http
2 changes: 1 addition & 1 deletion test/helm/gateway-addons-helm/e2e.out.yaml
@@ -9048,7 +9048,7 @@ spec:
dnsPolicy: ClusterFirst
containers:
- name: fluent-bit
image: "cr.fluentbit.io/fluent/fluent-bit:2.1.4"
image: "fluent/fluent-bit:2.1.4"
imagePullPolicy: Always
ports:
- name: http
7 changes: 7 additions & 0 deletions tools/make/kube.mk
@@ -191,6 +191,13 @@ install-e2e-telemetry: helm-generate.gateway-addons-helm
helm upgrade -i eg-addons charts/gateway-addons-helm --set grafana.enabled=false,opentelemetry-collector.enabled=true -n monitoring --create-namespace --timeout='$(WAIT_TIMEOUT)' --wait --wait-for-jobs
# Change loki service type from ClusterIP to LoadBalancer
kubectl patch service loki -n monitoring -p '{"spec": {"type": "LoadBalancer"}}'
# Wait for the telemetry services to be ready
kubectl rollout status --watch --timeout=5m -n monitoring deployment/prometheus
kubectl rollout status --watch --timeout=5m statefulset/loki -n monitoring
kubectl rollout status --watch --timeout=5m statefulset/tempo -n monitoring
# Restart otel-collector to make sure the otlp exporter works
kubectl rollout restart -n monitoring deployment/otel-collector
kubectl rollout status --watch --timeout=5m -n monitoring deployment/otel-collector

.PHONY: uninstall-e2e-telemetry
uninstall-e2e-telemetry: