From 772d84c3976c8ee688043f0a052d7198de30053e Mon Sep 17 00:00:00 2001
From: Yuri Shkuro
Date: Sun, 3 Nov 2024 01:10:01 -0400
Subject: [PATCH] Log time settings in CI (#6140)

## Which problem is this PR solving?
- ES v8.0/v2 integration tests often fail ([example](https://github.com/jaegertracing/jaeger/actions/runs/11622432732/job/32429220258)), but seemingly only during certain hours of the day. E.g., yesterday I reran the workflow 4 times at night and it repeatedly failed, then in the morning it succeeded.

## Description of the changes
- Log time settings in the CI runner to verify whether this is a time-zone-related issue with locating the indices
- Add debug logging to ES storage to indicate which indices are being used for reading / writing (no issues with that were noticed in the failed run above)
- Add debug logging to v2 binary configs
- Fix a bug in `integration.go` where `require.NoError` was used inside lambda functions passed to `waitForCondition` - it was causing tests to fail on transient errors from Elasticsearch. Changed it to log the error and return `false` so that `waitForCondition` can keep looping and waiting for a successful check (see the sketch below).
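
For illustration, here is a condensed before/after sketch of the callback pattern (abridged from the `testGetServices` change in the diff; the comparison step is simplified):

```go
// Before: require.NoError fails the test on the first transient
// Elasticsearch error, so waitForCondition never gets a chance to retry.
found := s.waitForCondition(t, func(t *testing.T) bool {
	var err error
	actual, err = s.SpanReader.GetServices(context.Background())
	require.NoError(t, err)
	// ... compare actual against expected (simplified here) ...
	return len(actual) == len(expected)
})

// After: treat the error as "not ready yet" - log it and return false
// so that waitForCondition keeps polling until it succeeds or times out.
found = s.waitForCondition(t, func(t *testing.T) bool {
	var err error
	actual, err = s.SpanReader.GetServices(context.Background())
	if err != nil {
		t.Log(err)
		return false
	}
	// ... compare actual against expected (simplified here) ...
	return len(actual) == len(expected)
})
```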

---------

Signed-off-by: Yuri Shkuro
---
 .github/workflows/ci-e2e-elasticsearch.yml | 5 ++++
 cmd/jaeger/config-badger.yaml | 10 +++++++
 cmd/jaeger/config-cassandra.yaml | 10 +++++++
 cmd/jaeger/config-elasticsearch.yaml | 10 +++++++
 cmd/jaeger/config-kafka-collector.yaml | 1 +
 cmd/jaeger/config-kafka-ingester.yaml | 5 +++-
 cmd/jaeger/config-opensearch.yaml | 10 +++++++
 cmd/jaeger/config-remote-storage.yaml | 10 +++++++
 .../config-tail-sampling-always-sample.yaml | 9 +++++-
 ...fig-tail-sampling-service-name-policy.yaml | 9 +++++-
 cmd/jaeger/config.yaml | 10 +++++++
 cmd/jaeger/internal/all-in-one.yaml | 2 ++
 plugin/storage/es/spanstore/reader.go | 26 ++++++++++++----
 plugin/storage/es/spanstore/reader_test.go | 5 +++-
 plugin/storage/es/spanstore/writer.go | 1 +
 plugin/storage/es/spanstore/writer_test.go | 8 ++---
 plugin/storage/integration/integration.go | 30 +++++++++++++++----
 scripts/spm-integration-test.sh | 5 +---
 18 files changed, 141 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/ci-e2e-elasticsearch.yml b/.github/workflows/ci-e2e-elasticsearch.yml
index 1cdda82eb59..e10491e736e 100644
--- a/.github/workflows/ci-e2e-elasticsearch.yml
+++ b/.github/workflows/ci-e2e-elasticsearch.yml
@@ -53,6 +53,11 @@ jobs:
       with:
         go-version: 1.23.x
 
+    - name: time settings
+      run: |
+        date
+        echo TZ="$TZ"
+
     - uses: docker/setup-qemu-action@49b3bc8e6bdd4a60e6116a5414239cba5943d3cf # v3.2.0
     - name: Run ${{ matrix.version.distribution }} integration tests
       id: test-execution
diff --git a/cmd/jaeger/config-badger.yaml b/cmd/jaeger/config-badger.yaml
index 3d222bf37bb..158e241cb53 100644
--- a/cmd/jaeger/config-badger.yaml
+++ b/cmd/jaeger/config-badger.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp]
       processors: [batch]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: info
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-cassandra.yaml b/cmd/jaeger/config-cassandra.yaml
index 0076fa48fd0..3ead38258c2 100644
--- a/cmd/jaeger/config-cassandra.yaml
+++ b/cmd/jaeger/config-cassandra.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp]
       processors: [batch]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-elasticsearch.yaml b/cmd/jaeger/config-elasticsearch.yaml
index bae543e39ee..0883dc3b395 100644
--- a/cmd/jaeger/config-elasticsearch.yaml
+++ b/cmd/jaeger/config-elasticsearch.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp]
       processors: [batch]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-kafka-collector.yaml b/cmd/jaeger/config-kafka-collector.yaml
index 28e178075a0..b04e05d177b 100644
--- a/cmd/jaeger/config-kafka-collector.yaml
+++ b/cmd/jaeger/config-kafka-collector.yaml
@@ -10,6 +10,7 @@ service:
       service.name: jaeger_collector
     metrics:
       level: detailed
+      address: 0.0.0.0:8888
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-kafka-ingester.yaml b/cmd/jaeger/config-kafka-ingester.yaml
index a16bc8a9490..e3947f531b7 100644
--- a/cmd/jaeger/config-kafka-ingester.yaml
+++ b/cmd/jaeger/config-kafka-ingester.yaml
@@ -6,8 +6,11 @@ service:
       processors: [batch]
       exporters: [jaeger_storage_exporter]
   telemetry:
+    resource:
+      service.name: jaeger
     metrics:
-      address: 0.0.0.0:8889
+      level: detailed
+      address: 0.0.0.0:8888
     logs:
       level: debug
 
diff --git a/cmd/jaeger/config-opensearch.yaml b/cmd/jaeger/config-opensearch.yaml
index e6b65e121e5..1fe57d42c39 100644
--- a/cmd/jaeger/config-opensearch.yaml
+++ b/cmd/jaeger/config-opensearch.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp]
       processors: [batch]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-remote-storage.yaml b/cmd/jaeger/config-remote-storage.yaml
index 1f5cc699df5..8402281c17b 100644
--- a/cmd/jaeger/config-remote-storage.yaml
+++ b/cmd/jaeger/config-remote-storage.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp]
       processors: [batch]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-tail-sampling-always-sample.yaml b/cmd/jaeger/config-tail-sampling-always-sample.yaml
index 9f0f238f52b..cdba426ef10 100644
--- a/cmd/jaeger/config-tail-sampling-always-sample.yaml
+++ b/cmd/jaeger/config-tail-sampling-always-sample.yaml
@@ -6,8 +6,15 @@ service:
       processors: [tail_sampling]
       exporters: [jaeger_storage_exporter]
   telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
     logs:
-      level: DEBUG
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config-tail-sampling-service-name-policy.yaml b/cmd/jaeger/config-tail-sampling-service-name-policy.yaml
index 4b7ffa1fa0a..97ee2a7e53f 100644
--- a/cmd/jaeger/config-tail-sampling-service-name-policy.yaml
+++ b/cmd/jaeger/config-tail-sampling-service-name-policy.yaml
@@ -6,8 +6,15 @@ service:
       processors: [tail_sampling]
       exporters: [jaeger_storage_exporter]
   telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
     logs:
-      level: DEBUG
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/config.yaml b/cmd/jaeger/config.yaml
index c1615d86460..2df3addb078 100644
--- a/cmd/jaeger/config.yaml
+++ b/cmd/jaeger/config.yaml
@@ -5,6 +5,16 @@ service:
       receivers: [otlp, jaeger, zipkin]
       processors: [batch, adaptive_sampling]
       exporters: [jaeger_storage_exporter]
+  telemetry:
+    resource:
+      service.name: jaeger
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+    logs:
+      level: debug
+      # TODO Initialize telemetery tracer once OTEL released new feature.
+      # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
 extensions:
   healthcheckv2:
diff --git a/cmd/jaeger/internal/all-in-one.yaml b/cmd/jaeger/internal/all-in-one.yaml
index bc5433b67c5..5417b95d133 100644
--- a/cmd/jaeger/internal/all-in-one.yaml
+++ b/cmd/jaeger/internal/all-in-one.yaml
@@ -11,6 +11,8 @@ service:
     metrics:
       level: detailed
       address: 0.0.0.0:8888
+    logs:
+      level: info
     # TODO Initialize telemetery tracer once OTEL released new feature.
     # https://github.com/open-telemetry/opentelemetry-collector/issues/10663
 
diff --git a/plugin/storage/es/spanstore/reader.go b/plugin/storage/es/spanstore/reader.go
index 084f1131028..accd1932e53 100644
--- a/plugin/storage/es/spanstore/reader.go
+++ b/plugin/storage/es/spanstore/reader.go
@@ -135,12 +135,15 @@ func NewSpanReader(p SpanReaderParams) *SpanReader {
         spanIndex: p.SpanIndex,
         serviceIndex: p.ServiceIndex,
         spanConverter: dbmodel.NewToDomain(p.TagDotReplacement),
-        timeRangeIndices: getTimeRangeIndexFn(p.Archive, p.UseReadWriteAliases, p.RemoteReadClusters),
-        sourceFn: getSourceFn(p.Archive, p.MaxDocCount),
-        maxDocCount: p.MaxDocCount,
-        useReadWriteAliases: p.UseReadWriteAliases,
-        logger: p.Logger,
-        tracer: p.Tracer,
+        timeRangeIndices: getLoggingTimeRangeIndexFn(
+            p.Logger,
+            getTimeRangeIndexFn(p.Archive, p.UseReadWriteAliases, p.RemoteReadClusters),
+        ),
+        sourceFn: getSourceFn(p.Archive, p.MaxDocCount),
+        maxDocCount: p.MaxDocCount,
+        useReadWriteAliases: p.UseReadWriteAliases,
+        logger: p.Logger,
+        tracer: p.Tracer,
     }
 }
 
@@ -148,6 +151,17 @@ type timeRangeIndexFn func(indexName string, indexDateLayout string, startTime t
 
 type sourceFn func(query elastic.Query, nextTime uint64) *elastic.SearchSource
 
+func getLoggingTimeRangeIndexFn(logger *zap.Logger, fn timeRangeIndexFn) timeRangeIndexFn {
+    if !logger.Core().Enabled(zap.DebugLevel) {
+        return fn
+    }
+    return func(indexName string, indexDateLayout string, startTime time.Time, endTime time.Time, reduceDuration time.Duration) []string {
+        indices := fn(indexName, indexDateLayout, startTime, endTime, reduceDuration)
+        logger.Debug("Reading from ES indices", zap.Strings("index", indices))
+        return indices
+    }
+}
+
 func getTimeRangeIndexFn(archive, useReadWriteAliases bool, remoteReadClusters []string) timeRangeIndexFn {
     if archive {
         var archiveSuffix string
diff --git a/plugin/storage/es/spanstore/reader_test.go b/plugin/storage/es/spanstore/reader_test.go
index 7d4b2881d49..3b292008dcd 100644
--- a/plugin/storage/es/spanstore/reader_test.go
+++ b/plugin/storage/es/spanstore/reader_test.go
@@ -22,6 +22,7 @@ import (
     "go.opentelemetry.io/otel/sdk/trace/tracetest"
     "go.opentelemetry.io/otel/trace"
     "go.uber.org/zap"
+    "go.uber.org/zap/zaptest"
 
     "github.com/jaegertracing/jaeger/internal/metricstest"
     "github.com/jaegertracing/jaeger/model"
@@ -167,7 +168,9 @@ func TestNewSpanReader(t *testing.T) {
     }
     for _, test := range tests {
         t.Run(test.name, func(t *testing.T) {
-            reader := NewSpanReader(test.params)
+            params := test.params
+            params.Logger = zaptest.NewLogger(t)
+            reader := NewSpanReader(params)
             require.NotNil(t, reader)
             assert.Equal(t, test.maxSpanAge, reader.maxSpanAge)
         })
diff --git a/plugin/storage/es/spanstore/writer.go b/plugin/storage/es/spanstore/writer.go
index dd32a843ab3..77e781d8aaf 100644
--- a/plugin/storage/es/spanstore/writer.go
+++ b/plugin/storage/es/spanstore/writer.go
@@ -128,6 +128,7 @@ func (s *SpanWriter) WriteSpan(_ context.Context, span *model.Span) error {
         s.writeService(serviceIndexName, jsonSpan)
     }
     s.writeSpan(spanIndexName, jsonSpan)
+    s.logger.Debug("Wrote span to ES index", zap.String("index", spanIndexName))
     return nil
 }
 
diff --git a/plugin/storage/es/spanstore/writer_test.go b/plugin/storage/es/spanstore/writer_test.go
index 381f08271db..8781967a94b 100644
--- a/plugin/storage/es/spanstore/writer_test.go
+++ b/plugin/storage/es/spanstore/writer_test.go
@@ -146,12 +146,10 @@ func TestSpanWriter_WriteSpan(t *testing.T) {
         expectedLogs []string
     }{
         {
-            caption: "span insertion error",
-
"span insertion error", serviceIndexExists: false, - - expectedError: "", - expectedLogs: []string{}, + expectedError: "", + expectedLogs: []string{"Wrote span to ES index"}, }, } for _, tc := range testCases { diff --git a/plugin/storage/integration/integration.go b/plugin/storage/integration/integration.go index 40c98beebf1..243cc8c9a1c 100644 --- a/plugin/storage/integration/integration.go +++ b/plugin/storage/integration/integration.go @@ -144,7 +144,10 @@ func (s *StorageIntegration) testGetServices(t *testing.T) { found := s.waitForCondition(t, func(t *testing.T) bool { var err error actual, err = s.SpanReader.GetServices(context.Background()) - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } sort.Strings(actual) t.Logf("Retrieved services: %v", actual) if len(actual) > len(expected) { @@ -246,7 +249,10 @@ func (s *StorageIntegration) testGetOperations(t *testing.T) { var err error actual, err = s.SpanReader.GetOperations(context.Background(), spanstore.OperationQueryParameters{ServiceName: "example-service-1"}) - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } sort.Slice(actual, func(i, j int) bool { return actual[i].Name < actual[j].Name }) @@ -327,7 +333,10 @@ func (s *StorageIntegration) findTracesByQuery(t *testing.T, query *spanstore.Tr found := s.waitForCondition(t, func(t *testing.T) bool { var err error traces, err = s.SpanReader.FindTraces(context.Background(), query) - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } if len(expected) != len(traces) { t.Logf("Expecting certain number of traces: expected: %d, actual: %d", len(expected), len(traces)) return false @@ -464,7 +473,10 @@ func (s *StorageIntegration) testGetDependencies(t *testing.T) { found := s.waitForCondition(t, func(t *testing.T) bool { var err error actual, err = s.DependencyReader.GetDependencies(context.Background(), time.Now(), 5*time.Minute) - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } sort.Slice(actual, func(i, j int) bool { return actual[i].Parent < actual[j].Parent }) @@ -495,7 +507,10 @@ func (s *StorageIntegration) testGetThroughput(t *testing.T) { _ = s.waitForCondition(t, func(t *testing.T) bool { var err error actual, err = s.SamplingStore.GetThroughput(start, start.Add(time.Second*time.Duration(10))) - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } return assert.ObjectsAreEqualValues(expected, len(actual)) }) assert.Len(t, actual, expected) @@ -517,7 +532,10 @@ func (s *StorageIntegration) testGetLatestProbability(t *testing.T) { found := s.waitForCondition(t, func(t *testing.T) bool { var err error actual, err = s.SamplingStore.GetLatestProbabilities() - require.NoError(t, err) + if err != nil { + t.Log(err) + return false + } return assert.ObjectsAreEqualValues(expected, actual) }) if !assert.True(t, found) { diff --git a/scripts/spm-integration-test.sh b/scripts/spm-integration-test.sh index c38db71a41e..718700294eb 100755 --- a/scripts/spm-integration-test.sh +++ b/scripts/spm-integration-test.sh @@ -67,9 +67,6 @@ wait_for_services() { echo "Waiting for services to be up and running..." check_service_health "Jaeger" "http://localhost:16686" check_service_health "Prometheus" "http://localhost:9090/graph" - # Grafana is not actually important for the functional test, - # but we still validate that the docker-compose file is correct. 
-  check_service_health "Grafana" "http://localhost:3000"
 }
 
 # Function to validate the service metrics
@@ -111,7 +108,7 @@ validate_service_metrics() {
     echo "⏳ Expecting at least 4 non-zero data points"
     return 1
   fi
-  
+
   # Validate if labels are correct
   local url="http://localhost:16686/api/metrics/calls?service=${service}&groupByOperation=true&endTs=&lookback=${fiveMinutes}&step=${fifteenSec}&ratePer=${oneMinute}"