From 2f2dd9e84dd6fea65a04e03cff786a8c1b86fc43 Mon Sep 17 00:00:00 2001
From: Hisar Balik
Date: Tue, 10 Sep 2024 10:35:12 +0200
Subject: [PATCH] fix: Flaky fluent-bit load test setup (#1404)

Co-authored-by: Nina Hingerl <76950046+NHingerl@users.noreply.github.com>
---
 docs/contributor/benchmarks/README.md         | 30 +++++++++--------
 hack/load-tests/log-fluentbit-test-setup.yaml |  7 ++--
 hack/load-tests/run-load-test.sh              | 32 +++++++++----------
 3 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/docs/contributor/benchmarks/README.md b/docs/contributor/benchmarks/README.md
index bc09d1366..15838293a 100644
--- a/docs/contributor/benchmarks/README.md
+++ b/docs/contributor/benchmarks/README.md
@@ -337,21 +337,25 @@ test results are printed out.
-| Version/Test | Single Pipeline (ci-logs) | | | | | Multi Pipeline (ci-logs-m) | | | | | Single Pipeline Backpressure (ci-logs-b) | | | | | Multi Pipeline Backpressure (ci-logs-mb) | | | | |
-|--------------------:|:---------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:---------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:----------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:----------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|
-| | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage(MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage(MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage(MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage(MB) | Pod CPU Usage |
-| 2.2.1 | 5165 | 8541 | 68518 | 172, 190 | 1, 1 | 2009 | 2195 | 102932 | 332, 320 | 0.9, 0.9 | 5914 | 1498 | 79247 | 184, 176 | 0.9, 1 | 1979 | 489 | 83442 | 310, 322 | 0.9, 0.9 |
-| 2.2.2 | 5159 | 7811 | 75545 | 171, 170 | 1, 1 | 1910 | 2516 | 103780 | 324, 324 | 0.9, 0.9 | 5857 | 1513 | 72494 | 189, 200 | 1, 1 | 1860 | 421 | 90852 | 314, 322 | 0.9, 0.9 |
-| 2.2.2 (new setup) | 5445 | 9668 | 68453 | 248, 981 | 1, 1 | 6201 | 2747 | 89291 | 544, 720 | 1, 1 | 6009 | 1723 | 58982 | 650, 682 | 1, 1 | 6111 | 385 | 108909 | 686, 931 | 0.9, 0.9 |
-| 3.0.3 | 9483 | 22042 | 53251 | 366, 681 | 1, 1 | 10737 | 8785 | 115232 | 953, 568 | 0.9, 0.9 | 10425 | 4610 | 80614 | 856, 704 | 0.9, 0.9 | 10955 | 1724 | 87530 | 503, 594 | 0.9 ,0.9 |
-| 3.0.4 | 4341 | 8296 | 35628 | 971, 726 | 0.1,0.1 | 1201 | 544 | 103624 | 652, 815 | 0, 0 | 932 | 297 | 37663 | 615, 726 | 0.1,0.1 | 1477 | 171 | 108885 | 530, 566 | 0, 0.1 |
-| 3.0.7 (old metrics) | 4241 | 7782 | 47586 | 815,1021 | 0.7,0.1 | 3809 | 1968 | 107529 | 837,965 | 0.4,0 | 3472 | 1093 | 33818 | 792,597 | 0,0.1 | 2180 | 177 | 87052 | 708,631 | 0,0.1 |
-| 3.0.7 (new metrics) | 4036 | 7173 | 31689 | 825,852 | 0.1,0.1 | 2481 | 1852 | 104689 | 747,395 | 0.1,0 | 1520 | 484 | 37907 | 561,731 | 0.1,0.1 | 807 | 58 | 94365 | 544,211 | 0,0 |
-| 3.0.7 (new) | 9514 | 30273 | 30263 | 105, 113 | 1, 1 | 9027 | 23850 | 1521511 | 186, 552 | 1, 0.7 | 7285 | 8357 | 1891569 | 662, 668 | 0.8, 0.8 | 5602 | 2619 | 5249308 | 680, 713 | 0.5, 0.5 |
-| 3.1.3 | 8922 | 28278 | 34609 | 105,107 | 0.8,0.9 | 4542 | 9605 | 2676743 | 601,528 | 0.4,0.4 | 3764 | 4216 | 1896390 | 620,636 | 0.4,0.4 | 3336 | 1499 | 4892724 | 678,698 | 0.3,0.3 |
-| 3.1.6 | 9423 | 30652 | 37562 | 105 | 0.9 | 4522 | 9517 | 2414417 | 631 | 0.4 | 4002 | 4945 | 1897270 | 693 | 0.5 | 3224 | 1680 | 4898631 | 634 | 0.3 |
+| Version/Test | Single Pipeline (ci-logs) | | | | | Multi Pipeline (ci-logs-m) | | | | | Single Pipeline Backpressure (ci-logs-b) | | | | | Multi Pipeline Backpressure (ci-logs-mb) | | | | |
+|---------------------------:|:---------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:---------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:----------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|:----------------------------------------:|:----------------------------------------:|:-------------------------------:|:--------------------:|:-------------:|
+| | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage (MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage (MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage (MB) | Pod CPU Usage | Input Bytes Processing Rate/sec (KByte) | Output Bytes Processing Rate/sec (KByte) | Filesystem Buffer Usage (KByte) | Pod Memory Usage (MB) | Pod CPU Usage |
+| 2.2.1 | 5165 | 8541 | 68518 | 172, 190 | 1, 1 | 2009 | 2195 | 102932 | 332, 320 | 0.9, 0.9 | 5914 | 1498 | 79247 | 184, 176 | 0.9, 1 | 1979 | 489 | 83442 | 310, 322 | 0.9, 0.9 |
+| 2.2.2 | 5159 | 7811 | 75545 | 171, 170 | 1, 1 | 1910 | 2516 | 103780 | 324, 324 | 0.9, 0.9 | 5857 | 1513 | 72494 | 189, 200 | 1, 1 | 1860 | 421 | 90852 | 314, 322 | 0.9, 0.9 |
+| 2.2.2 (new setup) | 5445 | 9668 | 68453 | 248, 981 | 1, 1 | 6201 | 2747 | 89291 | 544, 720 | 1, 1 | 6009 | 1723 | 58982 | 650, 682 | 1, 1 | 6111 | 385 | 108909 | 686, 931 | 0.9, 0.9 |
+| 3.0.3 | 9483 | 22042 | 53251 | 366, 681 | 1, 1 | 10737 | 8785 | 115232 | 953, 568 | 0.9, 0.9 | 10425 | 4610 | 80614 | 856, 704 | 0.9, 0.9 | 10955 | 1724 | 87530 | 503, 594 | 0.9, 0.9 |
+| 3.0.4 | 4341 | 8296 | 35628 | 971, 726 | 0.1, 0.1 | 1201 | 544 | 103624 | 652, 815 | 0, 0 | 932 | 297 | 37663 | 615, 726 | 0.1, 0.1 | 1477 | 171 | 108885 | 530, 566 | 0, 0.1 |
+| 3.0.7 (old metrics) | 4241 | 7782 | 47586 | 815, 1021 | 0.7, 0.1 | 3809 | 1968 | 107529 | 837, 965 | 0.4, 0 | 3472 | 1093 | 33818 | 792, 597 | 0, 0.1 | 2180 | 177 | 87052 | 708, 631 | 0, 0.1 |
+| 3.0.7 (new metrics) | 4036 | 7173 | 31689 | 825, 852 | 0.1, 0.1 | 2481 | 1852 | 104689 | 747, 395 | 0.1, 0 | 1520 | 484 | 37907 | 561, 731 | 0.1, 0.1 | 807 | 58 | 94365 | 544, 211 | 0, 0 |
+| 3.0.7 (new) | 9514 | 30273 | 30263 | 105, 113 | 1, 1 | 9027 | 23850 | 1521511 | 186, 552 | 1, 0.7 | 7285 | 8357 | 1891569 | 662, 668 | 0.8, 0.8 | 5602 | 2619 | 5249308 | 680, 713 | 0.5, 0.5 |
+| 3.1.3 | 8922 | 28278 | 34609 | 105, 107 | 0.8, 0.9 | 4542 | 9605 | 2676743 | 601, 528 | 0.4, 0.4 | 3764 | 4216 | 1896390 | 620, 636 | 0.4, 0.4 | 3336 | 1499 | 4892724 | 678, 698 | 0.3, 0.3 |
+| 3.1.6 | 9423 | 30652 | 37562 | 105 | 0.9 | 4522 | 9517 | 2414417 | 631 | 0.4 | 4002 | 4945 | 1897270 | 693 | 0.5 | 3224 | 1680 | 4898631 | 634 | 0.3 |
+| 3.1.3 (after optimization) | 9149 | 29694 | 37883 | 93, 98 | 1, 0.8 | 4344 | 8354 | 2893257 | 601, 587 | 0.4, 0.4 | 3715 | 4000 | 1947742 | 739, 692 | 0.4, 0.4 | 3200 | 1314 | 4950684 | 662, 693 | 0.3, 0.3 |
+| 3.1.6 (after optimization) | 10183 | 33117 | 30358 | 98, 93 | 1, 1 | 4349 | 8935 | 2779129 | 539, 660 | 0.4, 0.4 | 5471 | 5840 | 1889836 | 646, 641 | 0.5, 0.6 | 3224 | 1621 | 4882502 | 686, 649 | 0.3, 0.3 |
+> **NOTE:** The test results can vary by up to 1000 KByte/sec in the input and up to 3000 KByte/sec in the output processing rate for single-pipeline tests. Multi-pipeline results can be lower than single-pipeline results because the test setup is shared between multiple pipelines, which may put pressure on the shared resources.
+>

 ## Self Monitor

 ### Assumptions
diff --git a/hack/load-tests/log-fluentbit-test-setup.yaml b/hack/load-tests/log-fluentbit-test-setup.yaml
index 23c7c7ef9..7a252d958 100644
--- a/hack/load-tests/log-fluentbit-test-setup.yaml
+++ b/hack/load-tests/log-fluentbit-test-setup.yaml
@@ -11,14 +11,15 @@ metadata:
   namespace: log-load-test
 data:
   fluent.conf: |-
+    <system>
+      workers 2
+    </system>
     <source>
       @type http
       port 9880
       bind 0.0.0.0
-      body_size_limit 50m
-      add_http_headers true
       <parse>
-        @type json
+        @type none
       </parse>
     </source>
diff --git a/hack/load-tests/run-load-test.sh b/hack/load-tests/run-load-test.sh
index 91980d1da..4b1f7946d 100755
--- a/hack/load-tests/run-load-test.sh
+++ b/hack/load-tests/run-load-test.sh
@@ -266,8 +266,8 @@ function get_result_and_cleanup_trace() {
     RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
+    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
     RESULT_RESTARTS_COLLECTOR=$(kubectl -n kyma-system get pod -l app.kubernetes.io/name=telemetry-trace-collector -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')

     kill %1
@@ -293,8 +293,8 @@ function get_result_and_cleanup_metric() {
     RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
+    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
     RESULT_RESTARTS_GATEWAY=$(kubectl -n kyma-system get pod -l app.kubernetes.io/name=telemetry-metric-gateway -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')

     kill %1
@@ -321,8 +321,8 @@ function get_result_and_cleanup_metricagent() {
     RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[] | .value[1]')
     RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]')
-    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
+    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
     RESULT_RESTARTS_GATEWAY=$(kubectl -n kyma-system get pod -l app.kubernetes.io/name=telemetry-metric-gateway -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')
     RESULT_RESTARTS_AGENT=$(kubectl -n kyma-system get pod -l app.kubernetes.io/name=telemetry-metric-agent -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')

@@ -341,11 +341,11 @@ function get_result_and_cleanup_log_otel() {
     QUERY_MEMORY='query=round(sum(avg_over_time(container_memory_working_set_bytes{namespace="log-load-test", container="collector"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod) / 1024 / 1024)'
     QUERY_CPU='query=round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="log-load-test"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="log-load-test", workload="log-gateway"}[20m])) by (pod), 0.1)'

-    RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[0].value[1]')
+    RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
+    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
     RESULT_RESTARTS_GATEWAY=$(kubectl -n log-load-test get pod -l app.kubernetes.io/name=log-gateway -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')
     RESULT_RESTARTS_GENERATOR=$(kubectl -n log-load-test get pod -l app.kubernetes.io/name=log-load-generator -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')

@@ -367,11 +367,11 @@ function get_result_and_cleanup_fluentbit() {
     QUERY_MEMORY='query=round(sum(avg_over_time(container_memory_working_set_bytes{namespace="kyma-system", container="fluent-bit"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-fluent-bit"}[20m])) by (pod) / 1024 / 1024)'
     QUERY_CPU='query=round(sum(avg_over_time(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{namespace="kyma-system"}[20m]) * on(namespace,pod) group_left(workload) avg_over_time(namespace_workload_pod:kube_pod_owner:relabel{namespace="kyma-system", workload="telemetry-fluent-bit"}[20m])) by (pod), 0.1)'

-    RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[0].value[1]')
-    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[0].value[1]')
+    RESULT_RECEIVED=$(curl -fs --data-urlencode "$QUERY_RECEIVED" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_EXPORTED=$(curl -fs --data-urlencode "$QUERY_EXPORTED" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_QUEUE=$(curl -fs --data-urlencode "$QUERY_QUEUE" $PROMAPI | jq -r '.data.result[] | .value[1]')
+    RESULT_MEMORY=$(curl -fs --data-urlencode "$QUERY_MEMORY" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
+    RESULT_CPU=$(curl -fs --data-urlencode "$QUERY_CPU" $PROMAPI | jq -r '.data.result[] | .value[1]' | tr '\n' ',')
     RESULT_RESTARTS_FLUENTBIT=$(kubectl -n kyma-system get pod -l app.kubernetes.io/name=fluent-bit -ojsonpath='{.items[0].status.containerStatuses[*].restartCount}' | jq -s 'add')

     kill %1
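For reviewers, a minimal sketch of what the changed `jq`/`tr` pipeline produces, using an invented two-pod response shaped like the Prometheus HTTP API output that `$PROMAPI` returns: the old `.data.result[0].value[1]` selector kept only the first pod's value, while the new `.data.result[] | .value[1]` emits one value per pod and `tr '\n' ','` joins them into a single comma-separated field (with a trailing comma), which is consistent with the per-pod value pairs such as `98, 93` recorded in the benchmark table.

```bash
# Invented two-pod response in the shape of a Prometheus HTTP API result;
# the timestamp and values are placeholders for illustration only.
RESPONSE='{"data":{"result":[{"value":[1725955200,"98"]},{"value":[1725955200,"93"]}]}}'

# Old pipeline: reports only the first pod's value.
echo "$RESPONSE" | jq -r '.data.result[0].value[1]'
# 98

# New pipeline: one value per pod, joined into a comma-separated list
# (tr leaves a trailing comma).
echo "$RESPONSE" | jq -r '.data.result[] | .value[1]' | tr '\n' ','
# 98,93,
```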