Skip to content

Commit

Permalink
e2e: fix ZipkinTracing flaky (envoyproxy#3899)
Browse files Browse the repository at this point in the history
* e2e: make sure OTel-collector is ready

Signed-off-by: zirain <[email protected]>

* fix gen

Signed-off-by: zirain <[email protected]>

* fix retry

Signed-off-by: zirain <[email protected]>

* remove infrastructure.parametersRef from all-namespace

Signed-off-by: zirain <[email protected]>

* update

Signed-off-by: zirain <[email protected]>

* update

Signed-off-by: zirain <[email protected]>

* lint

Signed-off-by: zirain <[email protected]>

* fix bad request

Signed-off-by: zirain <[email protected]>

* increase time of one cycle

Signed-off-by: zirain <[email protected]>

---------

Signed-off-by: zirain <[email protected]>
  • Loading branch information
zirain authored and guydc committed Jul 22, 2024
1 parent cd6f64c commit 3481ea5
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 93 deletions.
1 change: 1 addition & 0 deletions charts/gateway-addons-helm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ To uninstall the chart:
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
Expand Down
2 changes: 2 additions & 0 deletions charts/gateway-addons-helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ prometheus:
# Values for Fluent-bit dependency
fluent-bit:
enabled: true
image:
repository: fluent/fluent-bit # use image from dockerhub
fullnameOverride: fluent-bit
testFramework:
enabled: false
Expand Down
1 change: 1 addition & 0 deletions site/content/en/latest/install/gateway-addons-helm-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ An Add-ons Helm chart for Envoy Gateway
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
Expand Down
1 change: 1 addition & 0 deletions site/content/zh/latest/install/gateway-addons-helm-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ An Add-ons Helm chart for Envoy Gateway
| fluent-bit.config.service | string | `"[SERVICE]\n Daemon Off\n Flush {{ .Values.flush }}\n Log_Level {{ .Values.logLevel }}\n Parsers_File parsers.conf\n Parsers_File custom_parsers.conf\n HTTP_Server On\n HTTP_Listen 0.0.0.0\n HTTP_Port {{ .Values.metricsPort }}\n Health_Check On\n"` | |
| fluent-bit.enabled | bool | `true` | |
| fluent-bit.fullnameOverride | string | `"fluent-bit"` | |
| fluent-bit.image.repository | string | `"fluent/fluent-bit"` | |
| fluent-bit.podAnnotations."fluentbit.io/exclude" | string | `"true"` | |
| fluent-bit.podAnnotations."prometheus.io/path" | string | `"/api/v1/metrics/prometheus"` | |
| fluent-bit.podAnnotations."prometheus.io/port" | string | `"2020"` | |
Expand Down
5 changes: 0 additions & 5 deletions test/e2e/base/manifests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,6 @@ spec:
allowedRoutes:
namespaces:
from: All
infrastructure:
parametersRef:
group: gateway.envoyproxy.io
kind: EnvoyProxy
name: zipkin-tracing
---
apiVersion: v1
kind: Service
Expand Down
108 changes: 22 additions & 86 deletions test/e2e/tests/tracing.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,11 @@ package tests
import (
"context"
"fmt"
"net"
"net/http"
"net/url"
"strings"
"testing"
"time"

"github.com/go-logfmt/logfmt"
"github.com/gogo/protobuf/jsonpb" // nolint: depguard // tempopb use gogo/protobuf
"github.com/grafana/tempo/pkg/tempopb"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"sigs.k8s.io/controller-runtime/pkg/client"
httputils "sigs.k8s.io/gateway-api/conformance/utils/http"
"sigs.k8s.io/gateway-api/conformance/utils/kubernetes"
"sigs.k8s.io/gateway-api/conformance/utils/suite"
Expand Down Expand Up @@ -114,94 +105,39 @@ var ZipkinTracingTest = suite.ConformanceTest{
// should make them kept consistent
"service.name": fmt.Sprintf("%s/%s", gwNN.Namespace, gwNN.Name),
}
// let's wait for the log to be sent to stdout
if err := wait.PollUntilContextTimeout(context.TODO(), time.Second, time.Minute, true,
func(ctx context.Context) (bool, error) {
count, err := QueryTraceFromTempo(t, suite.Client, tags)
preCount, err := QueryTraceFromTempo(t, suite.Client, tags)
if err != nil {
t.Logf("failed to get trace count from tempo: %v", err)
return false, nil
}

if count > 0 {
return true, nil
httputils.MakeRequestAndExpectEventuallyConsistentResponse(t, suite.RoundTripper, suite.TimeoutConfig, gwAddr, expectedResponse)

// looks like we need almost 15 seconds to get the trace from Tempo?
err = wait.PollUntilContextTimeout(context.TODO(), time.Second, 15*time.Second, true, func(ctx context.Context) (done bool, err error) {
curCount, err := QueryTraceFromTempo(t, suite.Client, tags)
if err != nil {
t.Logf("failed to get curCount count from tempo: %v", err)
return false, nil
}

if curCount > preCount {
return true, nil
}

return false, nil
})
if err != nil {
t.Logf("failed to get current count from tempo: %v", err)
return false, nil
}
return false, nil

return true, nil
}); err != nil {
t.Errorf("failed to get trace from tempo: %v", err)
}
})
},
}

// QueryTraceFromTempo returns the number of traces that Tempo's search API
// reports for the given tags within the last 10 minutes.
//
// The Tempo endpoint is the first LoadBalancer ingress IP of the "tempo"
// Service in the "monitoring" namespace, on port 3100. On any failure
// (service lookup, no ingress IP assigned yet, request error, non-200 status
// or an undecodable body) it returns -1 and a non-nil error.
// TODO: move to utils package if needed
func QueryTraceFromTempo(t *testing.T, c client.Client, tags map[string]string) (int, error) {
	// Bound the whole query so that a stalled API server or Tempo instance
	// cannot hang the calling test indefinitely.
	const requestTimeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	defer cancel()

	svc := corev1.Service{}
	if err := c.Get(ctx, types.NamespacedName{
		Namespace: "monitoring",
		Name:      "tempo",
	}, &svc); err != nil {
		return -1, err
	}

	host := ""
	for _, ing := range svc.Status.LoadBalancer.Ingress {
		if ing.IP != "" {
			host = ing.IP
			break
		}
	}
	// Without an ingress IP the URL below would degrade to ":3100", so fail
	// with an explicit error instead of sending a request to the wrong place.
	if host == "" {
		return -1, fmt.Errorf("service monitoring/tempo has no load balancer ingress IP")
	}

	tagsQueryParam, err := createTagsQueryParam(tags)
	if err != nil {
		return -1, err
	}

	tempoURL := url.URL{
		Scheme: "http",
		Host:   net.JoinHostPort(host, "3100"),
		Path:   "/api/search",
	}
	now := time.Now()
	query := tempoURL.Query()
	query.Add("start", fmt.Sprintf("%d", now.Add(-10*time.Minute).Unix())) // query traces from last 10 minutes
	query.Add("end", fmt.Sprintf("%d", now.Unix()))
	query.Add("tags", tagsQueryParam)
	tempoURL.RawQuery = query.Encode()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, tempoURL.String(), nil)
	if err != nil {
		return -1, err
	}

	t.Logf("send request to %s", tempoURL.String())
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return -1, err
	}
	// This function is polled repeatedly; always release the connection.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return -1, fmt.Errorf("failed to query tempo, url=%s, status=%s", tempoURL.String(), res.Status)
	}

	tempoResponse := &tempopb.SearchResponse{}
	if err := jsonpb.Unmarshal(res.Body, tempoResponse); err != nil {
		return -1, err
	}

	total := len(tempoResponse.Traces)
	t.Logf("get response from tempo, url=%s, response=%v, total=%d", tempoURL.String(), tempoResponse, total)
	return total, nil
}

// createTagsQueryParam encodes tags as logfmt key/value pairs and returns the
// resulting string. It stops at the first pair that fails to encode and
// returns that error with an empty string.
//
// copy from https://github.com/grafana/tempo/blob/c0127c78c368319433c7c67ca8967adbfed2259e/cmd/tempo-query/tempo/plugin.go#L361
func createTagsQueryParam(tags map[string]string) (string, error) {
	var sb strings.Builder
	enc := logfmt.NewEncoder(&sb)
	for key, value := range tags {
		if err := enc.EncodeKeyval(key, value); err != nil {
			return "", err
		}
	}
	return sb.String(), nil
}
75 changes: 75 additions & 0 deletions test/e2e/tests/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/url"
"strings"
Expand All @@ -20,7 +21,10 @@ import (
"fortio.org/fortio/fhttp"
"fortio.org/fortio/periodic"
flog "fortio.org/log"
"github.com/go-logfmt/logfmt"
"github.com/gogo/protobuf/jsonpb" // nolint: depguard // tempopb use gogo/protobuf
"github.com/google/go-cmp/cmp"
"github.com/grafana/tempo/pkg/tempopb"
dto "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -494,3 +498,74 @@ type LokiQueryResponse struct {
}
}
}

// QueryTraceFromTempo returns the number of traces that Tempo's search API
// reports for the given tags within the last 10 minutes.
//
// The Tempo endpoint is the first LoadBalancer ingress IP of the "tempo"
// Service in the "monitoring" namespace, on port 3100. On any failure
// (service lookup, no ingress IP assigned yet, request error, non-200 status
// or an undecodable body) it returns -1 and a non-nil error.
func QueryTraceFromTempo(t *testing.T, c client.Client, tags map[string]string) (int, error) {
	// Bound the whole query so that a stalled API server or Tempo instance
	// cannot hang the calling test indefinitely.
	const requestTimeout = 10 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
	defer cancel()

	svc := corev1.Service{}
	if err := c.Get(ctx, types.NamespacedName{
		Namespace: "monitoring",
		Name:      "tempo",
	}, &svc); err != nil {
		return -1, err
	}

	host := ""
	for _, ing := range svc.Status.LoadBalancer.Ingress {
		if ing.IP != "" {
			host = ing.IP
			break
		}
	}
	// Without an ingress IP the URL below would degrade to ":3100", so fail
	// with an explicit error instead of sending a request to the wrong place.
	if host == "" {
		return -1, fmt.Errorf("service monitoring/tempo has no load balancer ingress IP")
	}

	tagsQueryParam, err := createTagsQueryParam(tags)
	if err != nil {
		return -1, err
	}

	tempoURL := url.URL{
		Scheme: "http",
		Host:   net.JoinHostPort(host, "3100"),
		Path:   "/api/search",
	}
	now := time.Now()
	query := tempoURL.Query()
	query.Add("start", fmt.Sprintf("%d", now.Add(-10*time.Minute).Unix())) // query traces from last 10 minutes
	query.Add("end", fmt.Sprintf("%d", now.Unix()))
	query.Add("tags", tagsQueryParam)
	tempoURL.RawQuery = query.Encode()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, tempoURL.String(), nil)
	if err != nil {
		return -1, err
	}

	t.Logf("send request to %s", tempoURL.String())
	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return -1, err
	}
	// This function is polled repeatedly; always release the connection.
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		return -1, fmt.Errorf("failed to query tempo, url=%s, status=%s", tempoURL.String(), res.Status)
	}

	tempoResponse := &tempopb.SearchResponse{}
	if err := jsonpb.Unmarshal(res.Body, tempoResponse); err != nil {
		return -1, err
	}

	total := len(tempoResponse.Traces)
	t.Logf("get response from tempo, url=%s, response=%v, total=%d", tempoURL.String(), tempoResponse, total)
	return total, nil
}

// createTagsQueryParam encodes tags as logfmt key/value pairs and returns the
// resulting string. It stops at the first pair that fails to encode and
// returns that error with an empty string.
//
// copy from https://github.com/grafana/tempo/blob/c0127c78c368319433c7c67ca8967adbfed2259e/cmd/tempo-query/tempo/plugin.go#L361
func createTagsQueryParam(tags map[string]string) (string, error) {
	var sb strings.Builder
	enc := logfmt.NewEncoder(&sb)
	for key, value := range tags {
		if err := enc.EncodeKeyval(key, value); err != nil {
			return "", err
		}
	}
	return sb.String(), nil
}
2 changes: 1 addition & 1 deletion test/helm/gateway-addons-helm/default.out.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9059,7 +9059,7 @@ spec:
dnsPolicy: ClusterFirst
containers:
- name: fluent-bit
image: "cr.fluentbit.io/fluent/fluent-bit:2.1.4"
image: "fluent/fluent-bit:2.1.4"
imagePullPolicy: Always
ports:
- name: http
Expand Down
2 changes: 1 addition & 1 deletion test/helm/gateway-addons-helm/e2e.out.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9048,7 +9048,7 @@ spec:
dnsPolicy: ClusterFirst
containers:
- name: fluent-bit
image: "cr.fluentbit.io/fluent/fluent-bit:2.1.4"
image: "fluent/fluent-bit:2.1.4"
imagePullPolicy: Always
ports:
- name: http
Expand Down
7 changes: 7 additions & 0 deletions tools/make/kube.mk
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,13 @@ install-e2e-telemetry: helm-generate.gateway-addons-helm
helm upgrade -i eg-addons charts/gateway-addons-helm --set grafana.enabled=false,opentelemetry-collector.enabled=true -n monitoring --create-namespace --timeout='$(WAIT_TIMEOUT)' --wait --wait-for-jobs
# Change loki service type from ClusterIP to LoadBalancer
kubectl patch service loki -n monitoring -p '{"spec": {"type": "LoadBalancer"}}'
# Wait for the telemetry services to be ready
kubectl rollout status --watch --timeout=5m -n monitoring deployment/prometheus
kubectl rollout status --watch --timeout=5m statefulset/loki -n monitoring
kubectl rollout status --watch --timeout=5m statefulset/tempo -n monitoring
# Restart otel-collector to make sure the OTLP exporter works
kubectl rollout restart -n monitoring deployment/otel-collector
kubectl rollout status --watch --timeout=5m -n monitoring deployment/otel-collector

.PHONY: uninstall-e2e-telemetry
uninstall-e2e-telemetry:
Expand Down

0 comments on commit 3481ea5

Please sign in to comment.