From 9feceb6c670a071d40da1302e9b156f16a739096 Mon Sep 17 00:00:00 2001 From: Salvador Fuentes Date: Thu, 7 Mar 2019 10:17:15 -0600 Subject: [PATCH] tracing: Add test to verify tracing works correctly This tracing test runs as follows: 1. Enables tracing on runtime and shim components. 2. Runs a Jaeger container that will receive the trace spans. 3. A kata-container is executed. 4. Verifies that Jaeger has collected a number of spans. 5. Disables tracing. Fixes: #1255. Signed-off-by: Salvador Fuentes --- .ci/configure_tracing_for_kata.sh | 46 ++++++ Makefile | 8 +- lib/common.bash | 14 +- tracing/tracing-test.sh | 250 ++++++++++++++++++++++++++++++ 4 files changed, 313 insertions(+), 5 deletions(-) create mode 100755 .ci/configure_tracing_for_kata.sh create mode 100755 tracing/tracing-test.sh diff --git a/.ci/configure_tracing_for_kata.sh b/.ci/configure_tracing_for_kata.sh new file mode 100755 index 0000000000..4f4af8c407 --- /dev/null +++ b/.ci/configure_tracing_for_kata.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail + +cidir=$(dirname "$0") +source "${cidir}/lib.sh" + +[ "$#" -eq 1 ] || die "Specify enable or disable" + +kata_cfg_file=$(kata-runtime kata-env --json |jq '.Runtime | .Config | .Path' |cut -d\" -f2) + +enable_tracing() { + info "Enabling kata tracing on $kata_cfg_file" + sudo crudini --set "$kata_cfg_file" shim.kata enable_tracing true + sudo crudini --set "$kata_cfg_file" runtime enable_tracing true + sudo crudini --set "$kata_cfg_file" runtime internetworking_model \"none\" + sudo crudini --set "$kata_cfg_file" runtime disable_new_netns true + sudo crudini --set "$kata_cfg_file" netmon enable_netmon false +} + +disable_tracing() { + info "Disabling kata tracing on $kata_cfg_file" + sudo crudini --set "$kata_cfg_file" shim.kata enable_tracing false + sudo crudini --set "$kata_cfg_file" runtime enable_tracing false + sudo crudini --set "$kata_cfg_file" runtime internetworking_model \"macvtap\" + sudo crudini --set "$kata_cfg_file" runtime disable_new_netns false + sudo crudini --set "$kata_cfg_file" netmon enable_netmon false +} + +main() { + cmd="$1" + case "$cmd" in + enable ) enable_tracing ;; + disable ) disable_tracing ;; + *) die "invalid command: '$cmd'" ;; + esac +} + +main "$@" diff --git a/Makefile b/Makefile index b5b65d8b53..ef86fa892c 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,9 @@ TIMEOUT := 60 # union for 'make test' -UNION := functional docker crio docker-compose network netmon docker-stability oci openshift kubernetes swarm vm-factory entropy ramdisk shimv2 +UNION := functional docker crio docker-compose network netmon \ + docker-stability oci openshift kubernetes swarm vm-factory \ + entropy ramdisk shimv2 tracing # skipped test suites for docker integration tests FILTER_FILE = .ci/hypervisors/$(KATA_HYPERVISOR)/filter_docker_$(KATA_HYPERVISOR).sh @@ -143,6 +145,9 @@ netmon: bash -f .ci/install_bats.sh bats integration/netmon/netmon_test.bats +tracing: + bash tracing/tracing-test.sh + test: ${UNION} check: checkcommits log-parser @@ -168,4 +173,5 @@ check: checkcommits log-parser network \ ramdisk \ test \ + tracing \ vm-factory diff --git a/lib/common.bash b/lib/common.bash index 553e24dfee..64b4b422b9 100755 --- a/lib/common.bash +++ b/lib/common.bash @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2018-2019 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 @@ -12,14 +12,20 @@ VC_POD_DIR="${VC_POD_DIR:-/var/lib/vc/sbs}" KATA_HYPERVISOR="${KATA_HYPERVISOR:-qemu}" -die(){ - msg="$*" +die() { + local msg="$*" echo "ERROR: $msg" >&2 exit 1 } +warn() { + local msg="$*" + echo "WARNING: $msg" +} + info() { - echo -e "INFO: $*" + local msg="$*" + echo "INFO: $msg" } # Check if the $1 argument is the name of a 'known' diff --git a/tracing/tracing-test.sh b/tracing/tracing-test.sh new file mode 100755 index 0000000000..5bd8f513c9 --- /dev/null +++ b/tracing/tracing-test.sh @@ -0,0 +1,250 @@ +#!/bin/bash +# Copyright (c) 2019 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +# + +set -o errexit +set -o nounset +set -o pipefail +set -o errtrace + +DEBUG=${DEBUG:-} +[ -n "$DEBUG" ] && set -o xtrace + +SCRIPT_PATH=$(dirname "$(readlink -f "$0")") +source "${SCRIPT_PATH}/../lib/common.bash" + +RUNTIME=${RUNTIME:-kata-runtime} + +jaeger_server=${jaeger_server:-localhost} +jaeger_ui_port=${jaeger_ui_port:-16686} +jaeger_docker_container_name="jaeger" + +# Cleanup will remove Jaeger container and +# disable tracing. +cleanup(){ + stop_jaeger 2>/dev/null || true + .ci/configure_tracing_for_kata.sh disable +} + +trap cleanup EXIT + +# Run an operation to generate Jaeger trace spans +create_trace() +{ + sudo docker run -ti --runtime "$RUNTIME" --net=none --rm busybox true +} + +start_jaeger() +{ + local jaeger_docker_image="jaegertracing/all-in-one:latest" + + # Defaults - see https://www.jaegertracing.io/docs/getting-started/ + docker run -d --runtime runc --name "${jaeger_docker_container_name}" \ + -e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \ + -p 5775:5775/udp \ + -p 6831:6831/udp \ + -p 6832:6832/udp \ + -p 5778:5778 \ + -p "${jaeger_ui_port}:${jaeger_ui_port}" \ + -p 14268:14268 \ + -p 9411:9411 \ + "$jaeger_docker_image" +} + +stop_jaeger() +{ + docker stop "${jaeger_docker_container_name}" + docker rm -f "${jaeger_docker_container_name}" +} + +# Returns status from Jaeger web UI +get_jaeger_status() +{ + local service="$1" + + [ -z "$service" ] && die "need jaeger service name" + + local attempt=0 + local status="" + + while [ $attempt -lt 10 ] + do + status=$(curl -s "http://${jaeger_server}:${jaeger_ui_port}/api/traces?service=${service}" 2>/dev/null) + local ret=$? + + [ "$ret" -eq 0 ] && [ -n "$status" ] && break + + attempt=$((attempt++)) + sleep 1 + done + + echo "$status" +} + +# Look for any "dangling" spans that have not been reported to the Jaeger +# agent. +check_missing_spans() +{ + local service="$1" + local min_spans="$2" + + [ -z "$service" ] && die "need jaeger service name" + [ -z "$min_spans" ] && die "need minimum trace span count" + + local logfile=$(mktemp) + + sudo journalctl -q -o cat -a -t "$service" > "$logfile" + sudo chown "$USER:" "$logfile" + + # This message needs to be logged by each Kata component, generally when + # debug is enabled. + local component_prefix="created span" + + # Message prefix added by Jaeger when LogSpans=true + # (see + # https://godoc.org/github.com/uber/jaeger-client-go/config#ReporterConfig). + local jaeger_reporter_prefix="Reporting span" + + local logged_spans=$(grep -E -o "${component_prefix} [^ ][^ ]*" "$logfile" | awk '{print $3}') + + if [ -z "$logged_spans" ] + then + info "No logged spans to check" + rm -f "$logfile" + return + fi + + local count=0 + + for span in $logged_spans + do + count=$((count+1)) + + # Remove quotes + span=$(echo $span|tr -d '"') + + grep -E -q "${jaeger_reporter_prefix} \<$span\>" "$logfile" || \ + die "span $count ($span) not reported" + done + + [ "$count" -lt "$min_spans" ] && \ + die "expected >= $min_spans reported spans, got $count" + + info "All $count spans reported" + + rm -f "$logfile" +} + +# Check Jaeger spans for the specified service. +check_jaeger_status() +{ + local service="$1" + local min_spans="$2" + + [ -z "$service" ] && die "need jaeger service name" + [ -z "$min_spans" ] && die "need minimum trace span count" + + local status + local errors=0 + + local attempt=0 + local attempts=3 + + while [ "$attempt" -lt "$attempts" ] + do + status=$(get_jaeger_status "$service") + + #------------------------------ + # Basic sanity checks + [ -z "$status" ] && die "failed to query status via HTTP" + + local span_lines=$(echo "$status"|jq -S '.data[].spans | length') + [ -z "$span_lines" ] && die "no span status" + + local span_lines_count=$(echo "$span_lines"|wc -l) + + # Total up all span counts + local spans=$(echo "$span_lines"|paste -sd+ -|bc) + [ -z "$spans" ] && die "no spans" + + # Ensure total span count is numeric + echo "$spans"|grep -q "^[0-9][0-9]*$" + [ $? -eq 0 ] || die "invalid span count: '$spans'" + + info "found $spans spans (across $span_lines_count traces)" + + # Validate + [ "$spans" -lt "$min_spans" ] && die "expected >= $min_spans spans, got $spans" + + # Look for common errors in span data + local errors1=$(echo "$status"|jq -S . 2>/dev/null|grep "invalid parent span") + if [ -n "$errors1" ] + then + errors=$((errors+1)) + warn "Found invalid parent span errors (attempt $attempt): $errors1" + continue + else + errors=$((errors-1)) + [ "$errors" -lt 0 ] && errors=0 + fi + + # Crude but it works + local errors2=$(echo "$status"|jq -S . 2>/dev/null|grep "\"warnings\""|grep -E -v "\") + if [ -n "$errors2" ] + then + errors=$((errors+1)) + warn "Found warnings (attempt $attempt): $errors2" + continue + else + errors=$((errors-1)) + [ "$errors" -lt 0 ] && errors=0 + fi + + attempt=$((attempt++)) + + [ "$errors" -eq 0 ] && break + done + + [ "$errors" -eq 0 ] || die "errors still detected after $attempts attempts" +} + +run_test() +{ + local min_spans="$1" + local service="$2" + + [ -z "$min_spans" ] && die "need minimum span count" + [ -z "$service" ] && die "need service name" + + create_trace + + check_jaeger_status "$service" "$min_spans" + + check_missing_spans "$service" "$min_spans" + + info "test passed" +} + +main() +{ + runtime_min_spans=10 + shim_min_spans=5 + + # Name of Jaeger "services" (aka Kata components) to check trace for. + runtime_service="kata-runtime" + shim_service="kata-shim" + + start_jaeger + + .ci/configure_tracing_for_kata.sh enable + + info "Checking runtime spans" + run_test "$runtime_min_spans" "$runtime_service" + + info "Checking shim spans" + run_test "$shim_min_spans" "$shim_service" +} + +main "$@"