diff --git a/.buildkite/benchmarks.pipeline.yml b/.buildkite/benchmarks.pipeline.yml new file mode 100644 index 00000000000..203188cde35 --- /dev/null +++ b/.buildkite/benchmarks.pipeline.yml @@ -0,0 +1,153 @@ +# Copied from pipeline.yml. +docker_plugin_default_config: &docker_plugin_default_config + image: "oasislabs/testing:0.3.0" + always_pull: true + workdir: /workdir + volumes: + - /var/lib/buildkite-agent/.coveralls:/root/.coveralls + - /var/lib/buildkite-agent/.codecov:/root/.codecov + # Shared Rust incremental compile caches. + - /var/tmp/cargo_ic/debug:/var/tmp/artifacts/debug/incremental + - /var/tmp/cargo_ic/debug_sgx:/var/tmp/artifacts/x86_64-unknown-linux-sgx/debug/incremental + # Shared Rust package checkouts directory. + - /var/tmp/cargo_pkg/git:/root/.cargo/git + - /var/tmp/cargo_pkg/registry:/root/.cargo/registry + # Shared Rust SGX standard library artifacts cache. + - /var/tmp/xargo_cache:/root/.xargo + # Shared Go package checkouts directory. + - /var/tmp/go_pkg:/root/go/pkg + # Intel SGX Application Enclave Services Manager (AESM) daemon running on + # the Buildkite host. 
+ - /var/run/aesmd/aesm.socket:/var/run/aesmd/aesm.socket + - /var/tmp/benchmarks:/var/tmp/benchmarks + environment: + - "LC_ALL=C.UTF-8" + - "LANG=C.UTF-8" + - "CARGO_TARGET_DIR=/var/tmp/artifacts" + - "CARGO_INSTALL_ROOT=/root/.cargo" + - "GOPROXY=https://proxy.golang.org/" + - "SLACK_WEBHOOK_URL" + - "METRICS_PUSH_ADDR" + - "METRICS_QUERY_ADDR" + - "METRICS_SOURCE_GIT_BRANCH" + - "METRICS_TARGET_GIT_BRANCH" + - "TESTS" + propagate-environment: true + unconfined: true + +docker_plugin: &docker_plugin + oasislabs/docker#v3.0.1-oasis1: + <<: *docker_plugin_default_config + +steps: + ############################################################### + # The following three steps are copied from code.pipeline.yml # + ############################################################### + ############ + # Build jobs + ############ + # This label needs to be synced with runtime-ethereum's + # .buildkite/scripts/download_utils.sh. + - label: Build Go node + command: + - .buildkite/go/build.sh + + # Upload the built artifacts. + - cd /workdir/go/oasis-node + - buildkite-agent artifact upload oasis-node + - cd /workdir/go/oasis-node/integrationrunner + - buildkite-agent artifact upload integrationrunner.test + - cd /workdir/go/oasis-test-runner + - buildkite-agent artifact upload oasis-test-runner + - cd /workdir/go/oasis-net-runner + - buildkite-agent artifact upload oasis-net-runner + - cd /workdir/go/oasis-remote-signer + - buildkite-agent artifact upload oasis-remote-signer + plugins: + <<: *docker_plugin + + # This label needs to be synced with runtime-ethereum's + # .buildkite/scripts/download_utils.sh. 
+ - label: Build Rust runtime loader + command: + - .buildkite/rust/build_generic.sh /workdir -p oasis-core-runtime-loader + - .buildkite/rust/build_generic.sh /workdir -p test-long-term-client + - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-client + - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-enc-client + - .buildkite/rust/build_generic.sh /workdir -p simple-keyvalue-ops-client + + # Upload the built artifacts. + - cd /var/tmp/artifacts/default/debug + - buildkite-agent artifact upload oasis-core-runtime-loader + # Clients for E2E tests. + - buildkite-agent artifact upload test-long-term-client + - buildkite-agent artifact upload simple-keyvalue-client + - buildkite-agent artifact upload simple-keyvalue-enc-client + - buildkite-agent artifact upload simple-keyvalue-ops-client + agents: + buildkite_agent_size: large + plugins: + <<: *docker_plugin + + #################### + # Runtime build jobs + #################### + # This label needs to be synced with runtime-ethereum's + # .buildkite/rust/test_runtime_and_gateway.sh and .buildkite/scripts/download_utils.sh. + - label: Build key manager runtime + command: + - .buildkite/rust/build_runtime.sh keymanager-runtime + - .buildkite/rust/build_runtime.sh tests/runtimes/simple-keyvalue + + # Upload the built artifacts. 
+ - cd /var/tmp/artifacts/sgx/x86_64-fortanix-unknown-sgx/debug + - buildkite-agent artifact upload oasis-core-keymanager-runtime.sgxs + - buildkite-agent artifact upload simple-keyvalue.sgxs + - cd /var/tmp/artifacts/default/debug + - buildkite-agent artifact upload oasis-core-keymanager-runtime + - buildkite-agent artifact upload simple-keyvalue + agents: + buildkite_agent_size: large + plugins: + <<: *docker_plugin + + ########################################################## + # Build benchmark analysis tool, perform tests, evaluate # + ########################################################## + - label: Build benchmark analysis tool + command: + - cd /workdir/go/extra/ba + - go build + - buildkite-agent artifact upload ba + plugins: + <<: *docker_plugin + + # Wait for all jobs defined before this point + # to finish running in parallel before continuing. + - wait + + - label: E2E tests + parallelism: 1 + timeout_in_minutes: 90 + command: + - .buildkite/scripts/download_e2e_test_artifacts.sh + - rm -rf /var/tmp/benchmarks/* + - .buildkite/scripts/test_e2e.sh --metrics.address $METRICS_PUSH_ADDR --metrics.push.interval 5s --num_runs 3 -t $TESTS + env: + TEST_BASE_DIR: /var/tmp/benchmarks + agents: + buildkite_agent_size: large + buildkite_agent_class: stable + plugins: + <<: *docker_plugin + + # Wait for all jobs defined before this point + # to finish running in parallel before continuing. + - wait + + - label: Benchmark analysis + command: + - .buildkite/scripts/download_benchmark_artifacts.sh + - .buildkite/scripts/daily_benchmark_analysis.sh + plugins: + <<: *docker_plugin diff --git a/.buildkite/scripts/daily_benchmark_analysis.sh b/.buildkite/scripts/daily_benchmark_analysis.sh new file mode 100755 index 00000000000..365a6ffedc0 --- /dev/null +++ b/.buildkite/scripts/daily_benchmark_analysis.sh @@ -0,0 +1,24 @@ +#! 
/bin/bash + +set -ux + +# Script invoked from .buildkite/benchmarks.pipeline.yml + +./go/extra/ba/ba cmp --metrics.address $METRICS_QUERY_ADDR --metrics.target.git_branch $METRICS_TARGET_GIT_BRANCH --metrics.source.git_branch $METRICS_SOURCE_GIT_BRANCH -t $TESTS 2>error.txt 1>out.txt +BA_RETURN_CODE=$? + +# Escape double quotes for JSON. +BA_STDOUT=`cat out.txt | sed "s/\"/\\\\\\\\\"/g"` +BA_STDERR=`cat error.txt | sed "s/\"/\\\\\\\\\"/g"` + +if [ $BA_RETURN_CODE != 0 ]; then + # Post error to slack channel. + curl -H "Content-Type: application/json" \ + -X POST \ + --data "{\"text\": \"Daily oasis-core benchmarks failed.\", \"attachments\":[{\"title\":\"stdout\",\"text\":\"$BA_STDOUT\"}, {\"title\":\"stderr\",\"text\":\"$BA_STDERR\"}]}" \ + "$SLACK_WEBHOOK_URL" + + # Exit with non-zero exit code, so that the buildkite build will be + # marked as failed. + exit 1 +fi diff --git a/.buildkite/scripts/download_benchmark_artifacts.sh b/.buildkite/scripts/download_benchmark_artifacts.sh new file mode 100755 index 00000000000..8f55b6cfab7 --- /dev/null +++ b/.buildkite/scripts/download_benchmark_artifacts.sh @@ -0,0 +1,14 @@ +#! /bin/bash + +#################################################### +# Download artifacts needed to perform benchmarks. 
# +#################################################### + +# Helpful tips on writing build scripts: +# https://buildkite.com/docs/pipelines/writing-build-scripts +set -euxo pipefail + +source .buildkite/scripts/common.sh + +# Extra tools +download_artifact ba go/extra/ba 755 diff --git a/.buildkite/scripts/test_e2e.sh b/.buildkite/scripts/test_e2e.sh index 0751fe70a80..a5eaf049171 100755 --- a/.buildkite/scripts/test_e2e.sh +++ b/.buildkite/scripts/test_e2e.sh @@ -44,7 +44,7 @@ ${WORKDIR}/go/oasis-test-runner/oasis-test-runner \ --e2e.runtime.binary_dir ${WORKDIR}/target/${runtime_target}/debug \ --e2e.runtime.loader ${WORKDIR}/target/default/debug/oasis-core-runtime-loader \ --e2e.tee_hardware ${OASIS_TEE_HARDWARE:-""} \ - --remote_signer.binary ${WORKDIR}/go/oasis-remote-signer/oasis-remote-signer \ + --params.remote-signer/basic.binary ${WORKDIR}/go/oasis-remote-signer/oasis-remote-signer \ --log.level info \ ${BUILDKITE_PARALLEL_JOB_COUNT:+--parallel.job_count ${BUILDKITE_PARALLEL_JOB_COUNT}} \ ${BUILDKITE_PARALLEL_JOB:+--parallel.job_index ${BUILDKITE_PARALLEL_JOB}} \ diff --git a/bench.sh b/bench.sh new file mode 100755 index 00000000000..9308f98bb10 --- /dev/null +++ b/bench.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# bench.sh and parse_bench.py are used to benchmark specific e2e test using +# various execution parameters and parse the results. +# Accepted environmental variables are: +# - GS: executor group size, +# - RT: number of runtimes used, +# - W: number of compute workers used. +# +# Because of syncing and flushing caches, you need to run this script as root. +# For example: +# sudo GS=2 RT=4 W=2 ./bench.sh multiple-runtimes +# +# If no environmental variables are set, this script benchmarks all GS and RT +# combinations over the default sets (GS: 1 2 3 4 6 8; RT: 1 2 3 4 5 6). W will equal GS for each run. + +set -x + +if [ "$#" -ne 1 ] +then + echo "Usage: $0 " + exit 1 +fi + +if [ "$USER" != "root" ] +then + echo "This script should be run as root." 
+ exit 2 +fi + +BENCH_RESULTS_DIR=bench-results +TEST="$1" + +cleanup() { + killall -q mpstat + killall -q iostat + killall -q sar +} + +trap cleanup EXIT +trap "exit" INT + +mkdir -p bench-results + +# Environmental variables GS and RT. +GROUP_SIZES=$GS +RUNTIMES=$RT +if [ -z "$GROUP_SIZES" ] +then + GROUP_SIZES="1 2 3 4 6 8" +fi +if [ -z "$RUNTIMES" ] +then + RUNTIMES="1 2 3 4 5 6" +fi + +for G in $GROUP_SIZES; do + for R in $RUNTIMES; do + if [ -z "$W" ] + then + WORKERS=$G + else + WORKERS=$W + fi + + echo "BENCHMARK: Executing test $TEST with executor group size $G, runtime count $R, and compute workers $WORKERS..." + BENCH_INFO=GS_$G.RT_$R.W_$WORKERS + + # Flush all to disk, clear any caches. + sync; echo 3 > /proc/sys/vm/drop_caches + + mpstat -P ALL 1 >$BENCH_RESULTS_DIR/$TEST.$BENCH_INFO.mpstat & + iostat -d /dev/sda -x 1 >$BENCH_RESULTS_DIR/$TEST.$BENCH_INFO.iostat & + sar -r 1 >$BENCH_RESULTS_DIR/$TEST.$BENCH_INFO.sar & + + + EXECUTOR_GROUP_SIZE=$G \ + COMPUTE_RUNTIMES=$R \ + COMPUTE_WORKERS=$WORKERS \ + /usr/bin/time -o $BENCH_RESULTS_DIR/$TEST.$BENCH_INFO.time \ + .buildkite/scripts/test_e2e.sh -t $TEST + + cleanup + done +done diff --git a/common.mk b/common.mk index c82a5dbed17..1fca7fcccbc 100644 --- a/common.mk +++ b/common.mk @@ -49,6 +49,8 @@ ifeq ($(and $(LATEST_TAG),$(IS_TAG)),NO) endif export VERSION +GIT_BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) + # Try to compute the next version based on the latest tag of the origin remote # using the Punch tool. # First, all tags from the origin remote are fetched. Next, the latest tag on @@ -90,7 +92,7 @@ GOFLAGS ?= -trimpath -v # Add Oasis Core's version as a linker string value definition. 
ifneq ($(VERSION),) - export GOLDFLAGS ?= "-X github.com/oasislabs/oasis-core/go/common/version.SoftwareVersion=$(VERSION)" + export GOLDFLAGS ?= "-X github.com/oasislabs/oasis-core/go/common/version.SoftwareVersion=$(VERSION) -X github.com/oasislabs/oasis-core/go/common/version.GitBranch=$(GIT_BRANCH)" endif # Go build command to use by default. diff --git a/go/Makefile b/go/Makefile index 8eeb5906d5c..aa91d9f17e7 100644 --- a/go/Makefile +++ b/go/Makefile @@ -10,7 +10,7 @@ all: build # Build. # List of Go binaries to build. -go-binaries := oasis-node oasis-test-runner oasis-net-runner oasis-remote-signer extra/stats +go-binaries := oasis-node oasis-test-runner oasis-net-runner oasis-remote-signer extra/stats extra/ba # List of test helpers to build. test-helpers := urkel # List of test vectors to generate. diff --git a/go/common/version/version.go b/go/common/version/version.go index 5d87f1a2710..f10076ac76b 100644 --- a/go/common/version/version.go +++ b/go/common/version/version.go @@ -55,6 +55,11 @@ var ( // by the linker. SoftwareVersion = "0.0-unset" + // GitBranch is the name of the git branch of Oasis Core. + // + // This is mostly used for reporting and metrics. + GitBranch = "" + // RuntimeProtocol versions the protocol between the Oasis node(s) and // the runtime. // diff --git a/go/extra/ba/README.md b/go/extra/ba/README.md new file mode 100644 index 00000000000..82aa865a6c2 --- /dev/null +++ b/go/extra/ba/README.md @@ -0,0 +1,10 @@ +# Benchmark analysis + +Connects to prometheus and compares results of benchmark results. 
+ +## Basic usage + +```bash +$ ba/ba cmp \ + --metrics.address http://localhost:9090 +``` diff --git a/go/extra/ba/cmd/ba.go b/go/extra/ba/cmd/ba.go new file mode 100644 index 00000000000..5f9f914826d --- /dev/null +++ b/go/extra/ba/cmd/ba.go @@ -0,0 +1,399 @@ +package cmd + +import ( + "context" + "fmt" + "github.com/pkg/errors" + "github.com/prometheus/client_golang/api" + "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "github.com/spf13/cobra" + flag "github.com/spf13/pflag" + "github.com/spf13/viper" + "os" + "sort" + "time" + + "github.com/oasislabs/oasis-core/go/common/logging" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics" + testCmd "github.com/oasislabs/oasis-core/go/oasis-test-runner/cmd" + testOasis "github.com/oasislabs/oasis-core/go/oasis-test-runner/oasis" +) + +const ( + cfgMetrics = "metrics" + cfgMetricsP = "m" + cfgMetricsTargetGitBranch = "metrics.target.git_branch" + cfgMetricsSourceGitBranch = "metrics.source.git_branch" +) + +var ( + cmpFlags = flag.NewFlagSet("", flag.ContinueOnError) + + cmpCmd = &cobra.Command{ + Use: "cmp", + Short: "compare last two benchmark instances", + Long: `cmp connects to prometheus, fetches the metrics of the benchmark instances and +compares them. By default, the most recent instance (source) is fetched and +compared to the pre-last (target). If --metrics.{target|source}.git_branch is +provided, it compares the most recent instances in the corresponding branches. +cmp compares all metrics provided by -m parameter and computes ratio source/target +of metric values. 
If any of the metrics exceeds max_threshold..{avg|max}_ratio +or doesn't reach min_threshold..{avg|max}_ratio, ba exits with error code 1.`, + Run: runCmp, + } + + allMetrics = map[string]*Metric{ + "time": &Metric{ + getter: getDuration, + maxThresholdAvgRatio: 1.1, + maxThresholdMaxRatio: 1.1, + }, + "du": &Metric{ + getter: getDiskUsage, + maxThresholdAvgRatio: 1.06, + maxThresholdMaxRatio: 1.06, + }, + "io": &Metric{ + getter: getIOWork, + maxThresholdAvgRatio: 1.2, + maxThresholdMaxRatio: 1.2, + }, + "mem": &Metric{ + getter: getRssAnonMemory, + maxThresholdAvgRatio: 1.1, + maxThresholdMaxRatio: 1.1, + }, + "cpu": &Metric{ + getter: getCPUTime, + maxThresholdAvgRatio: 1.05, + maxThresholdMaxRatio: 1.05, + }, + } + userMetrics []string + + logger = logging.GetLogger("cmd/ba") + client api.Client +) + +type Metric struct { + getter func(context.Context, string, *model.SampleStream) (float64, float64, error) + maxThresholdAvgRatio float64 + maxThresholdMaxRatio float64 + minThresholdAvgRatio float64 + minThresholdMaxRatio float64 +} + +// getDuration returns average and maximum running times of the given coarse benchmark instance ("up" metric w/ minute +// resolution time series). +func getDuration(ctx context.Context, test string, bi *model.SampleStream) (float64, float64, error) { + instance := string(bi.Metric[testOasis.MetricsLabelInstance]) + + // Re-fetch the given benchmark instance with second resolution. Each obtained time series corresponds to one run. 
+ v1api := v1.NewAPI(client) + r := v1.Range{ + Start: bi.Values[0].Timestamp.Time().Add(-1 * time.Minute), + End: bi.Values[len(bi.Values)-1].Timestamp.Time().Add(time.Minute), + Step: time.Second, + } + + query := "up" + bi.Metric.String() + " == 1.0" + result, warnings, err := v1api.QueryRange(ctx, query, r) + if err != nil { + common.EarlyLogAndExit(errors.Wrap(err, "error querying Prometheus")) + } + if len(warnings) > 0 { + logger.Warn("warnings while querying Prometheus", "warnings", warnings) + } + if len(result.(model.Matrix)) == 0 { + return 0, 0, fmt.Errorf("getDuration: no time series matched test: %s and instance: %s", test, instance) + } + // Compute average and max duration of runs. Since we have a second-resolution, each point denotes 1 second of run's + // uptime. Just count all points and divide them by the number of runs. + avgDuration := 0.0 + maxDuration := 0.0 + for _, s := range result.(model.Matrix) { + avgDuration += float64(len(s.Values)) + if maxDuration < float64(len(s.Values)) { + maxDuration = float64(len(s.Values)) + } + } + avgDuration /= float64(len(result.(model.Matrix))) + + return avgDuration, maxDuration, nil +} + +// getIOWork returns average and maximum sum of read and written bytes by all workers of the given coarse benchmark +// instance ("up" metric). +func getIOWork(ctx context.Context, test string, bi *model.SampleStream) (float64, float64, error) { + readAvg, readMax, err := getSummableMetric(ctx, "oasis_worker_disk_read_bytes", test, bi) + if err != nil { + return 0, 0, err + } + writtenAvg, writtenMax, err := getSummableMetric(ctx, "oasis_worker_disk_written_bytes", test, bi) + if err != nil { + return 0, 0, err + } + + return readAvg + writtenAvg, readMax + writtenMax, nil +} + +// getDiskUsage returns average and maximum sum of disk usage for all workers of the given coarse benchmark instance +// ("up" metric). 
+func getDiskUsage(ctx context.Context, test string, bi *model.SampleStream) (float64, float64, error) { + return getSummableMetric(ctx, "oasis_worker_disk_usage_bytes", test, bi) +} + +// getRssAnonMemory returns average and maximum sum of anonymous resident memory for all workers of the given coarse +// benchmark instance ("up" metric). +func getRssAnonMemory(ctx context.Context, test string, bi *model.SampleStream) (float64, float64, error) { + return getSummableMetric(ctx, "oasis_worker_mem_RssAnon_bytes", test, bi) +} + +// getCPUTime returns average and maximum sum of utime and stime for all workers of the given coarse benchmark instance +// ("up" metric). +func getCPUTime(ctx context.Context, test string, bi *model.SampleStream) (float64, float64, error) { + utimeAvg, utimeMax, err := getSummableMetric(ctx, "oasis_worker_cpu_utime_seconds", test, bi) + if err != nil { + return 0, 0, err + } + stimeAvg, stimeMax, err := getSummableMetric(ctx, "oasis_worker_cpu_stime_seconds", test, bi) + if err != nil { + return 0, 0, err + } + + return utimeAvg + stimeAvg, utimeMax + stimeMax, nil +} + +// getSummableMetric returns average and maximum sum of metrics for all workers of the given coarse benchmark instance +// ("up" metric). +func getSummableMetric(ctx context.Context, metric string, test string, bi *model.SampleStream) (float64, float64, error) { + instance := string(bi.Metric[testOasis.MetricsLabelInstance]) + + labels := bi.Metric.Clone() + // existing job denotes the oasis-test-runner worker. We want to sum disk space across all workers. + delete(labels, "job") + // We will average metric over all runs. + delete(labels, "run") + + v1api := v1.NewAPI(client) + + query := fmt.Sprintf("sum by (run) (%s %s)", metric, labels.String()) + + // Fetch value at last recorded time. Some metrics might not be available anymore, if prometheus was shut down. 
+ t := bi.Values[len(bi.Values)-1].Timestamp.Time() + + result, warnings, err := v1api.Query(ctx, query, t) + if err != nil { + common.EarlyLogAndExit(errors.Wrap(err, "error querying Prometheus")) + } + if len(warnings) > 0 { + logger.Warn("warnings while querying Prometheus", "warnings", warnings) + } + if len(result.(model.Vector)) == 0 { + return 0, 0, fmt.Errorf("getSummableMetric: no time series matched test: %s and instance: %s", test, instance) + } + + // Compute average and max values. + avg := 0.0 + max := 0.0 + for _, s := range result.(model.Vector) { + avg += float64(s.Value) + if max < float64(s.Value) { + max = float64(s.Value) + } + } + avg /= float64(len(result.(model.Vector))) + + return avg, max, nil +} + +// getCoarseBenchmarkInstances finds time series based on "up" metric w/ minute resolution for the given test and gitBranch +// ordered from the oldest to the most recent ones. +// +// This function is usually called to determine test instance pairs to compare with more fine granularity and specific +// metric afterwards. +// +// NB: Due to Prometheus limit, this function fetches time series in the past 183 hours only. +func getCoarseBenchmarkInstances(ctx context.Context, test string, gitBranch string) (model.Matrix, error) { + v1api := v1.NewAPI(client) + r := v1.Range{ + // XXX: Hardcoded max potential number of points in Prometheus is 11,000 which equals ~183 hours with minute resolution. 
+ Start: time.Now().Add(-183 * time.Hour), + End: time.Now(), + Step: time.Minute, + } + + labels := model.LabelSet{ + "job": testOasis.MetricsJobName, + testOasis.MetricsLabelTest: model.LabelValue(test), + } + if gitBranch != "" { + labels[testOasis.MetricsLabelGitBranch] = model.LabelValue(gitBranch) + } + + query := "max(up" + labels.String() + ") by (" + testOasis.MetricsLabelInstance + ") == 1.0" + result, warnings, err := v1api.QueryRange(ctx, query, r) + if err != nil { + logger.Error("error querying Prometheus", "err", err) + os.Exit(1) + } + if len(warnings) > 0 { + logger.Warn("warnings while querying Prometheus", "warnings", warnings) + } + + // Go through all obtained time series and order them by the timestamp of the first sample. + sort.Slice(result.(model.Matrix), func(i, j int) bool { + return result.(model.Matrix)[i].Values[0].Timestamp < result.(model.Matrix)[j].Values[0].Timestamp + }) + return result.(model.Matrix), nil +} + +// instanceNames extracts instance names from given Prometheus time series matrix. +func instanceNames(ts model.Matrix) []string { + var names []string + for _, t := range ts { + names = append(names, instanceName(t)) + } + return names +} + +// instanceName returns the instance name label of the given sample. +func instanceName(s *model.SampleStream) string { + return string(s.Metric[testOasis.MetricsLabelInstance]) +} + +// fetchAndCompare fetches the given metric from prometheus and compares the results. +// +// Returns false, if metric-specific ratios are exceeded or there is a problem obtaining time series. Otherwise true. 
+func fetchAndCompare(ctx context.Context, m string, test string, sInstance *model.SampleStream, tInstance *model.SampleStream) (succ bool) { + getMetric := allMetrics[m].getter + succ = true + + sAvg, sMax, err := getMetric(ctx, test, sInstance) + if err != nil { + logger.Error("error fetching source benchmark instance", "metric", m, "test", test, "instance", instanceName(sInstance), "err", err) + return false + } + + tAvg, tMax, err := getMetric(ctx, test, tInstance) + if err != nil { + logger.Error("error fetching target test instance", "metric", m, "test", test, "instance", instanceName(sInstance), "err", err) + return false + } + + // Compare average and max metric values and log error, if they exceed or don't reach required ratios. + maxAvgRatio := allMetrics[m].maxThresholdAvgRatio + maxMaxRatio := allMetrics[m].maxThresholdMaxRatio + minAvgRatio := allMetrics[m].minThresholdAvgRatio + minMaxRatio := allMetrics[m].minThresholdMaxRatio + logger.Info("obtained average ratio", "metric", m, "test", test, "source_avg", sAvg, "target_avg", tAvg, "ratio", sAvg/tAvg) + if maxAvgRatio != 0 && sAvg/tAvg > maxAvgRatio { + logger.Error("average metric value exceeds max allowed ratio", "metric", m, "test", test, "source_avg", sAvg, "target_avg", tAvg, "ratio", sAvg/tAvg, "max_allowed_avg_ratio", maxAvgRatio) + succ = false + } + if minAvgRatio != 0 && sAvg/tAvg < minAvgRatio { + logger.Error("average metric value doesn't reach min required ratio", "metric", m, "test", test, "source_avg", sAvg, "target_avg", tAvg, "ratio", sAvg/tAvg, "min_required_avg_ratio", minAvgRatio) + succ = false + } + logger.Info("obtained max ratio", "metric", m, "test", test, "source_max", sMax, "target_max", tMax, "ratio", sMax/tMax) + if maxMaxRatio != 0 && sMax/tMax > maxMaxRatio { + logger.Error("maximum metric value exceeds max ratio", "metric", m, "test", test, "source_max", sMax, "target_max", tMax, "ratio", sMax/tMax, "max_allowed_max_ratio", maxMaxRatio) + succ = false + } + if 
minMaxRatio != 0 && sMax/tMax < minMaxRatio { + logger.Error("maximum metric value doesn't reach min required ratio", "metric", m, "test", test, "source_max", sMax, "target_max", tMax, "ratio", sMax/tMax, "min_required_max_ratio", minMaxRatio) + succ = false + } + + return +} + +func runCmp(cmd *cobra.Command, args []string) { + var err error + client, err = api.NewClient(api.Config{ + Address: viper.GetString(metrics.CfgMetricsAddr), + }) + if err != nil { + logger.Error("error creating client", "err", err) + os.Exit(1) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + + succ := true + for _, test := range viper.GetStringSlice(testCmd.CfgTest) { + sInstances, err := getCoarseBenchmarkInstances(ctx, test, viper.GetString(cfgMetricsSourceGitBranch)) + if err != nil { + logger.Error("error querying for source test instances", "err", err) + os.Exit(1) + } + sNames := instanceNames(sInstances) + tInstances, err := getCoarseBenchmarkInstances(ctx, test, viper.GetString(cfgMetricsTargetGitBranch)) + if err != nil { + logger.Error("error querying for target test instances", "err", err) + os.Exit(1) + } + tNames := instanceNames(tInstances) + + if len(sNames) == 0 { + logger.Info("test does not have any source benchmark instances to compare, ignoring", "test", test) + continue + } + if len(tNames) == 0 { + logger.Info("test does not have any target benchmark instances to compare, ignoring", "test", test) + continue + } + + var sInstance, tInstance *model.SampleStream + if sNames[len(sNames)-1] != tNames[len(tNames)-1] { + // Benchmark instances differ e.g. because of different gitBranch. + sInstance = sInstances[len(sInstances)-1] + tInstance = tInstances[len(tInstances)-1] + } else { + // Last benchmark instances are equal, pick the pre-last one from the target instances. 
+ if len(tNames) < 2 { + logger.Info("test has only one benchmark instance, ignoring", "test", test, "source_instances", sNames, "target_instances", tNames) + continue + } + sInstance = sInstances[len(sInstances)-1] + tInstance = tInstances[len(tInstances)-2] + } + logger.Info("obtained source and target instance", "test", test, "source_instance", instanceName(sInstance), "target_instance", instanceName(tInstance)) + + for _, m := range userMetrics { + // Don't put succ = succ && f oneliner here, because f won't get executed once succ = false. + fSucc := fetchAndCompare(ctx, m, test, sInstance, tInstance) + succ = succ && fSucc + } + } + + if !succ { + os.Exit(1) + } + + defer cancel() +} + +// Register ba cmd sub-command and all of it's children. +func RegisterBaCmd(parentCmd *cobra.Command) { + var metricNames []string + for k := range allMetrics { + metricNames = append(metricNames, k) + cmpFlags.Float64Var(&allMetrics[k].maxThresholdAvgRatio, fmt.Sprintf("max_threshold.%s.avg_ratio", k), allMetrics[k].maxThresholdAvgRatio, fmt.Sprintf("maximum allowed ratio between average %s metrics", k)) + cmpFlags.Float64Var(&allMetrics[k].maxThresholdMaxRatio, fmt.Sprintf("max_threshold.%s.max_ratio", k), allMetrics[k].maxThresholdMaxRatio, fmt.Sprintf("maximum allowed ratio between maximum %s metrics", k)) + cmpFlags.Float64Var(&allMetrics[k].minThresholdAvgRatio, fmt.Sprintf("min_threshold.%s.avg_ratio", k), allMetrics[k].minThresholdAvgRatio, fmt.Sprintf("minimum required ratio between average %s metrics", k)) + cmpFlags.Float64Var(&allMetrics[k].minThresholdMaxRatio, fmt.Sprintf("min_threshold.%s.max_ratio", k), allMetrics[k].minThresholdMaxRatio, fmt.Sprintf("minimum required ratio between maximum %s metrics", k)) + } + + cmpFlags.String(cfgMetricsSourceGitBranch, "", "(optional) git_branch label for the source benchmark instance") + cmpFlags.String(cfgMetricsTargetGitBranch, "", "(optional) git_branch label for the target benchmark instance") + 
cmpFlags.StringSliceP(testCmd.CfgTest, testCmd.CfgTestP, nil, "name of e2e test(s) to process") + cmpFlags.StringSliceVarP(&userMetrics, cfgMetrics, cfgMetricsP, metricNames, "metrics to compare") + _ = viper.BindPFlags(cmpFlags) + + cmpCmd.Flags().AddFlagSet(cmpFlags) + _ = cmpCmd.MarkFlagRequired(testCmd.CfgTest) + + parentCmd.AddCommand(cmpCmd) +} diff --git a/go/extra/ba/cmd/root.go b/go/extra/ba/cmd/root.go new file mode 100644 index 00000000000..555b1eb2b80 --- /dev/null +++ b/go/extra/ba/cmd/root.go @@ -0,0 +1,49 @@ +// Package cmd implements ba cmd tool. +package cmd + +import ( + "fmt" + "github.com/oasislabs/oasis-core/go/common/logging" + "github.com/spf13/cobra" + flag "github.com/spf13/pflag" + "github.com/spf13/viper" + "os" + + "github.com/oasislabs/oasis-core/go/common/version" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics" +) + +var ( + rootCmd = &cobra.Command{ + Use: "ba", + Short: "Benchmark analysis", + Version: version.SoftwareVersion, + } + + rootFlags = flag.NewFlagSet("", flag.ContinueOnError) +) + +// RootCommand returns the root (top level) cobra.Command. +func RootCommand() *cobra.Command { + return rootCmd +} + +// Execute spawns the main entry point after handling the command line arguments. +func Execute() { + if err := rootCmd.Execute(); err != nil { + common.EarlyLogAndExit(err) + } +} + +func init() { + rootFlags.String(metrics.CfgMetricsAddr, "http://localhost:9090", "Prometheus query address") + _ = viper.BindPFlags(rootFlags) + rootCmd.PersistentFlags().AddFlagSet(rootFlags) + + if err := logging.Initialize(os.Stdout, logging.FmtJSON, logging.LevelInfo, nil); err != nil { + fmt.Println(fmt.Errorf("root: failed to initialize logging: %w", err)) + } + // Register all of the sub-commands. 
+ RegisterBaCmd(rootCmd) +} diff --git a/go/extra/ba/main.go b/go/extra/ba/main.go new file mode 100644 index 00000000000..df385baac2e --- /dev/null +++ b/go/extra/ba/main.go @@ -0,0 +1,10 @@ +// Benchmark analysis implementation. +package main + +import ( + "github.com/oasislabs/oasis-core/go/extra/ba/cmd" +) + +func main() { + cmd.Execute() +} diff --git a/go/go.mod b/go/go.mod index 66480e921b5..5e49b8f0cf2 100644 --- a/go/go.mod +++ b/go/go.mod @@ -44,7 +44,9 @@ require ( github.com/opentracing/opentracing-go v1.1.0 github.com/pelletier/go-toml v1.4.0 // indirect github.com/pkg/errors v0.8.1 - github.com/prometheus/client_golang v0.9.4 + github.com/prometheus/client_golang v1.4.1 + github.com/prometheus/common v0.9.1 + github.com/prometheus/procfs v0.0.10 github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a // indirect github.com/remyoudompheng/bigfft v0.0.0-20190512091148-babf20351dd7 // indirect github.com/seccomp/libseccomp-golang v0.9.1 @@ -71,7 +73,6 @@ require ( gitlab.com/yawning/dynlib.git v0.0.0-20190911075527-1e6ab3739fd8 golang.org/x/crypto v0.0.0-20191119213627-4f8c1d86b1ba golang.org/x/net v0.0.0-20190628185345-da137c7871d7 - golang.org/x/sys v0.0.0-20190912141932-bc967efca4b8 // indirect golang.org/x/text v0.3.2 // indirect google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55 google.golang.org/grpc v1.25.1 diff --git a/go/go.sum b/go/go.sum index a734223bd09..240f77a4e29 100644 --- a/go/go.sum +++ b/go/go.sum @@ -15,11 +15,15 @@ github.com/VividCortex/gohistogram v1.0.0/go.mod h1:Pf5mBqqDxYaXu3hDrrU+w6nw50o/ github.com/Workiva/go-datastructures v1.0.50/go.mod h1:Z+F2Rca0qCsVYDS8z7bAGm8f3UkzuWYS/oBZz5a7VVA= github.com/aead/siphash v1.0.1/go.mod h1:Nywa3cDsYNNK3gaciGTWPwHt0wlpNV15vwmswBAUSII= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod 
h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blevesearch/bleve v0.8.0 h1:DCoCrxscCXrlzVWK92k7Vq4d28lTAFuigVmcgIX0VCo= github.com/blevesearch/bleve v0.8.0/go.mod h1:Y2lmIkzV6mcNfAnAdOd+ZxHkHchhBfU/xroGIp61wfw= github.com/blevesearch/blevex v0.0.0-20180227211930-4b158bb555a3 h1:U6vnxZrTfItfiUiYx0lf/LgHjRSfaKK5QHSom3lEbnA= @@ -48,6 +52,8 @@ github.com/cenkalti/backoff/v4 v4.0.0/go.mod h1:eEew/i+1Q6OrCDZh3WiXYv3+nJwBASZ8 github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= +github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod 
h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= @@ -145,6 +151,8 @@ github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEW github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/go-cmp v0.2.0 h1:+dTQ8DZQJz0Mb/HjFlkptS1FeQ4cWSnN941F8aEG4SQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/gofuzz v0.0.0-20170612174753-24818f796faf/go.mod h1:HP5RmnzzSNb993RKQDq4+1A4ia9nllfqcQFTQJedwGI= github.com/google/gofuzz v1.0.0 h1:A8PeW59pxE9IoFRqBp37U+mSNaQoZ46F1f0f863XSXw= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -204,6 +212,8 @@ github.com/jmhodges/levigo v1.0.0/go.mod h1:Q6Qx+uH3RAqyK4rFQroq9RL7mdkABMcfhEI+ github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.9 h1:9yzud/Ht36ygwatGx56VwCZtlI/2AD15T1X2sjSuGns= +github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= @@ -315,7 +325,11 @@ github.com/minio/sha256-simd v0.1.0/go.mod h1:2FMWW+8GMoPweT6+pI63m9YE3Lmw4J71hV github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= 
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/mr-tron/base58 v1.1.0/go.mod h1:xcD2VGqlgYjBdcBLw+TuYLr8afG+Hj8g2eTVqeSzSU8= github.com/mr-tron/base58 v1.1.1/go.mod h1:xcD2VGqlgYjBdcBLw+TuYLr8afG+Hj8g2eTVqeSzSU8= @@ -380,19 +394,30 @@ github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXP github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= github.com/prometheus/client_golang v0.9.4 h1:Y8E/JaaPbmFSW2V81Ab/d8yZFYQQGbni1b1jPcG9Y6A= github.com/prometheus/client_golang v0.9.4/go.mod h1:oCXIBxdI62A4cR6aTRJCgetEjecSIYzOEaeAn4iYEpM= +github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= +github.com/prometheus/client_golang v1.4.1 h1:FFSuS004yOQEtDdTq+TAOLP5xUq63KqAFYyOi8zA+Y8= +github.com/prometheus/client_golang v1.4.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90 h1:S/YWwWx/RA8rT8tKFRuGUZhuA90OyIBpPCXkcbwU8DE= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= 
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4 h1:gQz4mCbXsO+nc9n1hCxHcGA3Zx3Eo+UHZoInFGUIXNM= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= +github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.4.1 h1:K0MGApIoQvMw27RTdJkPbr3JZ7DNbtxQNyi5STVM6Kw= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.9.1 h1:KOMtN28tlbam3/7ZKEYKHhKoJZYYj3gMH4uc62x7X7U= +github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8= +github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= +github.com/prometheus/procfs v0.0.10 h1:QJQN3jYQhkamO4mhfUWqdDH2asK7ONOI9MTWjyAxNKM= +github.com/prometheus/procfs v0.0.10/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rcrowley/go-metrics v0.0.0-20180503174638-e2704e165165/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= 
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a h1:9ZKAASQSHhDYGoxY8uLVpewe1GDZ2vu2Tr/vTdVAkFQ= @@ -406,6 +431,7 @@ github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR github.com/seccomp/libseccomp-golang v0.9.1 h1:NJjM5DNFOs0s3kYE1WUOr6G8V97sdt46rlXTMfXGWBo= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v1.0.0 h1:UVQPSSmc3qtTi+zPPkCXvZX9VvW/xT/NsRvKfwY81a8= github.com/smartystreets/assertions v1.0.0/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM= @@ -536,6 +562,7 @@ golang.org/x/net v0.0.0-20190227160552-c95aed5357e7/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190628185345-da137c7871d7 h1:rTIdg5QFRR7XCaK4LCjBiPbx8j4DQRpdYMnGn/bJUEU= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -545,6 +572,7 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync 
v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -556,11 +584,14 @@ golang.org/x/sys v0.0.0-20190219092855-153ac476189d/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190228124157-a34e9553db1e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190804053845-51ab0e2deafa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190912141932-bc967efca4b8 h1:41hwlulw1prEMBxLQSlMSux1zxJf07B3WPsdjJlKZxE= golang.org/x/sys v0.0.0-20190912141932-bc967efca4b8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82 h1:ywK/j/KkyTHcdyYSZNXGjMwgmDSfjglYZ3vStQ/gSCU= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text 
v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= @@ -573,6 +604,7 @@ golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3 golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= @@ -603,5 +635,7 @@ gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4 h1:/eiJrUcujPVeJ3xlSWaiNi3uSVmDGBK1pDHUHAnao1I= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.5 h1:ymVxjfMaHvXD8RqPRmzHHsB3VvucivSkIAvJFDI5O3c= +gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/go/oasis-net-runner/cmd/root.go b/go/oasis-net-runner/cmd/root.go index f510be912d0..c4beef0ad17 100644 --- a/go/oasis-net-runner/cmd/root.go +++ b/go/oasis-net-runner/cmd/root.go @@ -110,7 +110,7 @@ func runRoot(cmd *cobra.Command, args []string) error { defer rootEnv.Cleanup() logger := logging.GetLogger("net-runner") 
- childEnv, err := rootEnv.NewChild("net-runner") + childEnv, err := rootEnv.NewChild("net-runner", env.TestInstanceInfo{}) if err != nil { logger.Error("failed to setup child environment", "err", err, diff --git a/go/oasis-node/cmd/common/metrics/metrics.go b/go/oasis-node/cmd/common/metrics/metrics.go index 664dfaa7b2e..f21f10fb8c7 100644 --- a/go/oasis-node/cmd/common/metrics/metrics.go +++ b/go/oasis-node/cmd/common/metrics/metrics.go @@ -6,32 +6,63 @@ import ( "fmt" "net" "net/http" + "os" + "path/filepath" "strings" + "sync" "time" + "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/push" + "github.com/prometheus/procfs" flag "github.com/spf13/pflag" "github.com/spf13/viper" "github.com/oasislabs/oasis-core/go/common/service" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" ) const ( - cfgMetricsMode = "metrics.mode" - cfgMetricsAddr = "metrics.address" - cfgMetricsPushJobName = "metrics.push.job_name" - cfgMetricsPushInstanceLabel = "metrics.push.instance_label" - cfgMetricsPushInterval = "metrics.push.interval" - - metricsModeNone = "none" - metricsModePull = "pull" - metricsModePush = "push" + CfgMetricsMode = "metrics.mode" + CfgMetricsAddr = "metrics.address" + CfgMetricsPushJobName = "metrics.push.job_name" + CfgMetricsPushLabels = "metrics.push.labels" + CfgMetricsPushInterval = "metrics.push.interval" + + MetricsModeNone = "none" + MetricsModePull = "pull" + MetricsModePush = "push" + + // getconf CLK_TCK + ClockTicks = 100 ) -// Flags has the flags used by the metrics service. -var Flags = flag.NewFlagSet("", flag.ContinueOnError) +var ( + // Flags has the flags used by the metrics service. 
+ Flags = flag.NewFlagSet("", flag.ContinueOnError) + + diskServiceOnce sync.Once + memServiceOnce sync.Once + cpuServiceOnce sync.Once +) + +// ParseMetricPushLabels is a drop-in replacement for viper.GetStringMapString due to https://github.com/spf13/viper/issues/608. +func ParseMetricPushLabels(val string) map[string]string { + // viper.GetString() wraps the string inside [] brackets, unwrap it. + val = val[1 : len(val)-1] + + labels := map[string]string{} + for _, lPair := range strings.Split(val, ",") { + kv := strings.Split(lPair, "=") + if len(kv) != 2 || kv[0] == "" { + continue + } + labels[kv[0]] = kv[1] + } + return labels +} // ServiceConfig contains the configuration parameters for metrics. type ServiceConfig struct { @@ -41,8 +72,8 @@ type ServiceConfig struct { Address string // JobName is the name of the job for which metrics are collected. JobName string - // InstanceLabel is the instance label of the job being collected for. - InstanceLabel string + // Labels are the key-value labels of the job being collected for. + Labels map[string]string // Interval defined the push interval for metrics collection. Interval time.Duration } @@ -50,19 +81,27 @@ type ServiceConfig struct { // GetServiceConfig gets the metrics configuration parameter struct.
func GetServiceConfig() *ServiceConfig { return &ServiceConfig{ - Mode: viper.GetString(cfgMetricsMode), - Address: viper.GetString(cfgMetricsAddr), - JobName: viper.GetString(cfgMetricsPushJobName), - InstanceLabel: viper.GetString(cfgMetricsPushInstanceLabel), - Interval: viper.GetDuration(cfgMetricsPushInterval), + Mode: viper.GetString(CfgMetricsMode), + Address: viper.GetString(CfgMetricsAddr), + JobName: viper.GetString(CfgMetricsPushJobName), + Labels: ParseMetricPushLabels(viper.GetString(CfgMetricsPushLabels)), + Interval: viper.GetDuration(CfgMetricsPushInterval), } } type stubService struct { service.BaseBackgroundService + + dService *diskService + mService *memService + cService *cpuService } func (s *stubService) Start() error { + if err := s.dService.Start(); err != nil { + return err + } + return nil } @@ -73,8 +112,26 @@ func (s *stubService) Cleanup() {} func newStubService() (service.BackgroundService, error) { svc := *service.NewBaseBackgroundService("metrics") + d, err := NewDiskService() + if err != nil { + return nil, err + } + + m, err := NewMemService() + if err != nil { + return nil, err + } + + c, err := NewCPUService() + if err != nil { + return nil, err + } + return &stubService{ BaseBackgroundService: svc, + dService: d.(*diskService), + mService: m.(*memService), + cService: c.(*cpuService), }, nil } @@ -86,9 +143,23 @@ type pullService struct { ctx context.Context errCh chan error + + dService *diskService + mService *memService + cService *cpuService } func (s *pullService) Start() error { + if err := s.dService.Start(); err != nil { + return err + } + if err := s.mService.Start(); err != nil { + return err + } + if err := s.cService.Start(); err != nil { + return err + } + go func() { if err := s.s.Serve(s.ln); err != nil { s.BaseBackgroundService.Stop() @@ -122,12 +193,12 @@ func (s *pullService) Cleanup() { } func newPullService(ctx context.Context) (service.BackgroundService, error) { - addr := viper.GetString(cfgMetricsAddr) + 
addr := viper.GetString(CfgMetricsAddr) svc := *service.NewBaseBackgroundService("metrics") svc.Logger.Debug("Metrics Server Params", - "mode", metricsModePull, + "mode", MetricsModePull, "addr", addr, ) @@ -136,12 +207,30 @@ func newPullService(ctx context.Context) (service.BackgroundService, error) { return nil, err } + d, err := NewDiskService() + if err != nil { + return nil, err + } + + m, err := NewMemService() + if err != nil { + return nil, err + } + + c, err := NewCPUService() + if err != nil { + return nil, err + } + return &pullService{ BaseBackgroundService: svc, ctx: ctx, ln: ln, s: &http.Server{Handler: promhttp.Handler()}, errCh: make(chan error), + dService: d.(*diskService), + mService: m.(*memService), + cService: c.(*cpuService), }, nil } @@ -150,9 +239,23 @@ type pushService struct { pusher *push.Pusher interval time.Duration + + dService *diskService + mService *memService + cService *cpuService } func (s *pushService) Start() error { + if err := s.dService.Start(); err != nil { + return err + } + if err := s.mService.Start(); err != nil { + return err + } + if err := s.cService.Start(); err != nil { + return err + } + s.pusher = s.pusher.Gatherer(prometheus.DefaultGatherer) go s.worker() @@ -179,56 +282,420 @@ func (s *pushService) worker() { } func newPushService() (service.BackgroundService, error) { - addr := viper.GetString(cfgMetricsAddr) - jobName := viper.GetString(cfgMetricsPushJobName) - instanceLabel := viper.GetString(cfgMetricsPushInstanceLabel) - interval := viper.GetDuration(cfgMetricsPushInterval) + addr := viper.GetString(CfgMetricsAddr) + jobName := viper.GetString(CfgMetricsPushJobName) + labels := ParseMetricPushLabels(viper.GetString(CfgMetricsPushLabels)) + interval := viper.GetDuration(CfgMetricsPushInterval) if jobName == "" { return nil, fmt.Errorf("metrics: metrics.push.job_name required for push mode") } - if instanceLabel == "" { - return nil, fmt.Errorf("metrics: metrics.push.instance_label required for push mode") 
+ if labels["instance"] == "" { + return nil, fmt.Errorf("metrics: at least 'instance' key should be set for metrics.push.labels. Provided labels: %v, viper raw: %v", labels, viper.GetString(CfgMetricsPushLabels)) } svc := *service.NewBaseBackgroundService("metrics") svc.Logger.Debug("Metrics Server Params", - "mode", metricsModePush, + "mode", MetricsModePush, "addr", addr, "job_name", jobName, - "instance_label", instanceLabel, + "labels", labels, "push_interval", interval, ) + pusher := push.New(addr, jobName) + for k, v := range labels { + pusher = pusher.Grouping(k, v) + } + + d, err := NewDiskService() + if err != nil { + return nil, err + } + + m, err := NewMemService() + if err != nil { + return nil, err + } + + c, err := NewCPUService() + if err != nil { + return nil, err + } + return &pushService{ BaseBackgroundService: svc, - pusher: push.New(addr, jobName).Grouping("instance", instanceLabel), + pusher: pusher, interval: interval, + dService: d.(*diskService), + mService: m.(*memService), + cService: c.(*cpuService), }, nil } // New constructs a new metrics service. func New(ctx context.Context) (service.BackgroundService, error) { - mode := viper.GetString(cfgMetricsMode) + mode := viper.GetString(CfgMetricsMode) switch strings.ToLower(mode) { - case metricsModeNone: + case MetricsModeNone: return newStubService() - case metricsModePull: + case MetricsModePull: return newPullService(ctx) - case metricsModePush: + case MetricsModePush: return newPushService() default: return nil, fmt.Errorf("metrics: unsupported mode: '%v'", mode) } } +type diskService struct { + service.BaseBackgroundService + sync.Mutex + + dataDir string + // TODO: Should we monitor I/O of children PIDs as well? 
+ pid int + interval time.Duration + + diskUsageGauge prometheus.Gauge + diskIOReadBytesGauge prometheus.Gauge + diskIOWrittenBytesGauge prometheus.Gauge +} + +func (d *diskService) Start() error { + go d.worker() + return nil +} + +// updateDiskUsage walks through dataDir and updates diskUsageGauge attribute. +func (d *diskService) updateDiskUsage() error { + d.Lock() + defer d.Unlock() + + // Compute disk usage of datadir. + var duBytes int64 + err := filepath.Walk(d.dataDir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return errors.Wrap(err, fmt.Sprintf("disk usage metric: failed to access file %s", path)) + } + duBytes += info.Size() + return nil + }) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("disk usage metric: failed to walk directory %s", d.dataDir)) + } + d.diskUsageGauge.Set(float64(duBytes)) + + return nil +} + +// updateIO reads process info and updates diskIO{Read|Written}BytesGauge attributes. +func (d *diskService) updateIO() error { + d.Lock() + defer d.Unlock() + + // Obtain process I/O info. + proc, err := procfs.NewProc(d.pid) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("disk I/O metric: failed to obtain proc object for PID %d", d.pid)) + } + procIO, err := proc.IO() + if err != nil { + return errors.Wrap(err, fmt.Sprintf("disk I/O metric: failed to obtain procIO object %d", d.pid)) + } + + // Note: procfs ReadBytes/WriteBytes must feed the matching gauges. + d.diskIOWrittenBytesGauge.Set(float64(procIO.WriteBytes)) + d.diskIOReadBytesGauge.Set(float64(procIO.ReadBytes)) + + return nil +} + +func (d *diskService) worker() { + t := time.NewTicker(d.interval) + defer t.Stop() + + for { + select { + case <-d.Quit(): + // break would only exit the select; return terminates the worker. + return + case <-t.C: + } + + if err := d.updateDiskUsage(); err != nil { + d.Logger.Warn(err.Error()) + } + + if err := d.updateIO(); err != nil { + d.Logger.Warn(err.Error()) + } + } +} + +// NewDiskService constructs a new disk usage and I/O service.
+// +// This service will compute the size of datadir folder and read I/O info of the process every --metric.push.interval +// seconds. +func NewDiskService() (service.BackgroundService, error) { + ds := &diskService{ + BaseBackgroundService: *service.NewBaseBackgroundService("disk"), + dataDir: viper.GetString(common.CfgDataDir), + pid: os.Getpid(), + interval: viper.GetDuration(CfgMetricsPushInterval), + + diskUsageGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_disk_usage_bytes", + Help: "Size of datadir of the worker", + }, + ), + + diskIOReadBytesGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_disk_read_bytes", + Help: "Read bytes by the worker", + }, + ), + + diskIOWrittenBytesGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_disk_written_bytes", + Help: "Written bytes by the worker", + }, + ), + } + + diskCollectors := []prometheus.Collector{ + ds.diskUsageGauge, + ds.diskIOReadBytesGauge, + ds.diskIOWrittenBytesGauge, + } + + // Disk metrics are singletons per process. Ensure to register them only once. + diskServiceOnce.Do(func() { + prometheus.MustRegister(diskCollectors...) + }) + + return ds, nil +} + +type memService struct { + service.BaseBackgroundService + sync.Mutex + + // TODO: Should we monitor memory of children PIDs as well? + pid int + interval time.Duration + + VmSizeGauge prometheus.Gauge // nolint: golint + RssAnonGauge prometheus.Gauge + RssFileGauge prometheus.Gauge + RssShmemGauge prometheus.Gauge +} + +func (m *memService) Start() error { + go m.worker() + return nil +} + +// updateMemory updates current memory usage attributes. +func (m *memService) updateMemory() error { + m.Lock() + defer m.Unlock() + + /// Obtain process Memory info. 
+ proc, err := procfs.NewProc(m.pid) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("memory metric: failed to obtain proc object for PID %d", m.pid)) + } + procStatus, err := proc.NewStatus() + if err != nil { + return errors.Wrap(err, fmt.Sprintf("memory metric: failed to obtain procStatus object %d", m.pid)) + } + + m.VmSizeGauge.Set(float64(procStatus.VmSize)) + m.RssAnonGauge.Set(float64(procStatus.RssAnon)) + m.RssFileGauge.Set(float64(procStatus.RssFile)) + m.RssShmemGauge.Set(float64(procStatus.RssShmem)) + + return nil +} + +func (m *memService) worker() { + t := time.NewTicker(m.interval) + defer t.Stop() + + for { + select { + case <-m.Quit(): + // break would only exit the select; return terminates the worker. + return + case <-t.C: + } + + if err := m.updateMemory(); err != nil { + m.Logger.Warn(err.Error()) + } + } +} + +// NewMemService constructs a new memory usage service. +// +// This service will read memory info from process Status file every --metric.push.interval +// seconds. +func NewMemService() (service.BackgroundService, error) { + ms := &memService{ + BaseBackgroundService: *service.NewBaseBackgroundService("mem"), + pid: os.Getpid(), + interval: viper.GetDuration(CfgMetricsPushInterval), + + VmSizeGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_mem_VmSize_bytes", + Help: "Virtual memory size of worker", + }, + ), + + RssAnonGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_mem_RssAnon_bytes", + Help: "Size of resident anonymous memory of worker", + }, + ), + + RssFileGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_mem_RssFile_bytes", + Help: "Size of resident file mappings of worker", + }, + ), + + RssShmemGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_mem_RssShmem_bytes", + Help: "Size of resident shared memory of worker", + }, + ), + } + + memCollectors := []prometheus.Collector{ + ms.VmSizeGauge, + ms.RssAnonGauge, + ms.RssFileGauge, + ms.RssShmemGauge, + } + + // Memory metrics are singletons
per process. Ensure to register them only once. + memServiceOnce.Do(func() { + prometheus.MustRegister(memCollectors...) + }) + + return ms, nil +} + +type cpuService struct { + service.BaseBackgroundService + sync.Mutex + + // TODO: Should we monitor CPU of children PIDs as well? + pid int + interval time.Duration + + utimeGauge prometheus.Gauge + stimeGauge prometheus.Gauge +} + +func (c *cpuService) Start() error { + go c.worker() + return nil +} + +// updateCPU updates current CPU usage attributes. +func (c *cpuService) updateCPU() error { + c.Lock() + defer c.Unlock() + + // Obtain process CPU info. + proc, err := procfs.NewProc(c.pid) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("CPU metric: failed to obtain proc object for PID %d", c.pid)) + } + procStat, err := proc.Stat() + if err != nil { + return errors.Wrap(err, fmt.Sprintf("CPU metric: failed to obtain procStat object %d", c.pid)) + } + + c.utimeGauge.Set(float64(procStat.UTime) / float64(ClockTicks)) + c.stimeGauge.Set(float64(procStat.STime) / float64(ClockTicks)) + + return nil +} + +func (c *cpuService) worker() { + t := time.NewTicker(c.interval) + defer t.Stop() + + for { + select { + case <-c.Quit(): + // break would only exit the select; return terminates the worker. + return + case <-t.C: + } + + if err := c.updateCPU(); err != nil { + c.Logger.Warn(err.Error()) + } + } +} + +// NewCPUService constructs a new CPU usage service. +// +// This service will read CPU spent time info from process Stat file every +// --metric.push.interval seconds.
+func NewCPUService() (service.BackgroundService, error) { + cs := &cpuService{ + BaseBackgroundService: *service.NewBaseBackgroundService("cpu"), + pid: os.Getpid(), + interval: viper.GetDuration(CfgMetricsPushInterval), + + utimeGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_cpu_utime_seconds", + Help: "CPU user time spent by worker", + }, + ), + + stimeGauge: prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "oasis_worker_cpu_stime_seconds", + Help: "CPU system time spent by worker", + }, + ), + } + + cpuCollectors := []prometheus.Collector{ + cs.utimeGauge, + cs.stimeGauge, + } + + // CPU metrics are singletons per process. Ensure to register them only once. + cpuServiceOnce.Do(func() { + prometheus.MustRegister(cpuCollectors...) + }) + + return cs, nil +} + +// EscapeLabelCharacters replaces invalid prometheus label name characters with "_". +func EscapeLabelCharacters(l string) string { + return strings.Replace(l, ".", "_", -1) +} + func init() { - Flags.String(cfgMetricsMode, metricsModeNone, "metrics (prometheus) mode") - Flags.String(cfgMetricsAddr, "127.0.0.1:3000", "metrics pull/push address") - Flags.String(cfgMetricsPushJobName, "", "metrics push job name") - Flags.String(cfgMetricsPushInstanceLabel, "", "metrics push instance label") - Flags.Duration(cfgMetricsPushInterval, 5*time.Second, "metrics push interval") + Flags.String(CfgMetricsMode, MetricsModeNone, "metrics mode: none, pull, push") + Flags.String(CfgMetricsAddr, "127.0.0.1:3000", "metrics pull/push address") + Flags.String(CfgMetricsPushJobName, "", "metrics push job name") + Flags.StringToString(CfgMetricsPushLabels, map[string]string{}, "metrics push instance label") + Flags.Duration(CfgMetricsPushInterval, 5*time.Second, "metrics push interval") _ = viper.BindPFlags(Flags) } diff --git a/go/oasis-node/cmd/common/pprof/pprof.go b/go/oasis-node/cmd/common/pprof/pprof.go index a3933fa7e81..64509a98574 100644 --- a/go/oasis-node/cmd/common/pprof/pprof.go +++ 
b/go/oasis-node/cmd/common/pprof/pprof.go @@ -6,7 +6,12 @@ import ( "net" "net/http" "net/http/pprof" + "os" + "runtime" + runtimePprof "runtime/pprof" + "strconv" + "github.com/pkg/errors" flag "github.com/spf13/pflag" "github.com/spf13/viper" @@ -30,6 +35,33 @@ type pprofService struct { errCh chan error } +/// Writes the current process heap to given file with unique suffix. +func WriteHeap(name string) error { + // Find unique filename. + var filename string + i := 1 + for { + filename = name + ".prof." + strconv.Itoa(i) + if _, err := os.Stat(filename); err != nil { + break + } + i++ + } + + // Write memory profiling data. + mprof, merr := os.Create(filename) + if merr != nil { + return errors.Wrap(merr, "failed to create file for memory profiler output") + } + defer mprof.Close() + runtime.GC() + if merr = runtimePprof.WriteHeapProfile(mprof); merr != nil { + return errors.Wrap(merr, "failed to write heap profile") + } + + return nil +} + func (p *pprofService) Start() error { if p.address == "" { return nil diff --git a/go/oasis-node/cmd/storage/benchmark/benchmark.go b/go/oasis-node/cmd/storage/benchmark/benchmark.go index 64782117760..5ec9a8bd4c2 100644 --- a/go/oasis-node/cmd/storage/benchmark/benchmark.go +++ b/go/oasis-node/cmd/storage/benchmark/benchmark.go @@ -256,7 +256,8 @@ func doBenchmark(cmd *cobra.Command, args []string) { // nolint: gocyclo []byte("Excepting thee, of all these hosts of hostile chiefs arrayed,"), []byte("There shines not one shall leave alive the battlefield!"), } - expectedNewRoot := [...]byte{82, 3, 202, 16, 125, 182, 175, 25, 51, 188, 131, 181, 118, 76, 249, 15, 53, 89, 59, 224, 95, 75, 239, 182, 157, 30, 80, 48, 237, 108, 90, 22} + var expectedNewRoot hash.Hash + _ = expectedNewRoot.UnmarshalHex("131859d5048d5b11677ffed800b0329962960efae70b4def7023c380c2f075ee") var emptyRoot hash.Hash emptyRoot.Empty() diff --git a/go/oasis-test-runner/cmd/root.go b/go/oasis-test-runner/cmd/root.go index b882088c130..35c68d52507 100644 --- 
a/go/oasis-test-runner/cmd/root.go +++ b/go/oasis-test-runner/cmd/root.go @@ -7,9 +7,14 @@ import ( "os" "path/filepath" "sort" + "strconv" "strings" + "sync" + "time" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/push" "github.com/spf13/cobra" flag "github.com/spf13/pflag" "github.com/spf13/viper" @@ -18,6 +23,7 @@ import ( "github.com/oasislabs/oasis-core/go/common/version" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" cmdFlags "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/flags" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics" "github.com/oasislabs/oasis-core/go/oasis-test-runner/env" "github.com/oasislabs/oasis-core/go/oasis-test-runner/oasis" "github.com/oasislabs/oasis-core/go/oasis-test-runner/scenario" @@ -28,7 +34,9 @@ const ( cfgLogFmt = "log.format" cfgLogLevel = "log.level" cfgLogNoStdout = "log.no_stdout" - cfgTest = "test" + cfgNumRuns = "num_runs" + CfgTest = "test" + CfgTestP = "t" cfgParallelJobCount = "parallel.job_count" cfgParallelJobIndex = "parallel.job_index" ) @@ -50,10 +58,26 @@ var ( rootFlags = flag.NewFlagSet("", flag.ContinueOnError) cfgFile string + numRuns int scenarioMap = make(map[string]scenario.Scenario) defaultScenarios []scenario.Scenario scenarios []scenario.Scenario + + // oasis-test-runner-specific metrics. + upGauge = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: "up", + Help: "Is oasis-test-runner test active", + }, + ) + + oasisTestRunnerCollectors = []prometheus.Collector{ + upGauge, + } + + pusher *push.Pusher + oasisTestRunnerOnce sync.Once ) // RootCmd returns the root command's structure that will be executed, so that @@ -73,17 +97,125 @@ func Execute() { } // RegisterNondefault adds a scenario to the runner. 
-func RegisterNondefault(scenario scenario.Scenario) error { - n := strings.ToLower(scenario.Name()) +func RegisterNondefault(s scenario.Scenario) error { + n := strings.ToLower(s.Name()) if _, ok := scenarioMap[n]; ok { return errors.New("root: scenario already registered: " + n) } - scenarioMap[n] = scenario - scenarios = append(scenarios, scenario) + scenarioMap[n] = s + scenarios = append(scenarios, s) + + params := s.Parameters() + if len(params) > 0 { + for k, v := range scenario.ParametersToStringMap(params) { + // Re-register rootFlags for test parameters. + rootFlags.StringSlice("params."+n+"."+k, []string{v}, "") + rootCmd.PersistentFlags().AddFlagSet(rootFlags) + _ = viper.BindPFlag("params."+n+"."+k, rootFlags.Lookup("params."+n+"."+k)) + } + } + return nil } +// parseTestParams parses --params..=,... flags combinations, clones provided proto- +// scenarios, and populates them so that each scenario instance has unique paramater set. Returns mapping test name -> +// list of scenario instances. +func parseTestParams(toRun []scenario.Scenario) (map[string][]scenario.Scenario, error) { + r := make(map[string][]scenario.Scenario) + for _, s := range toRun { + zippedParams := make(map[string][]string) + for k := range s.Parameters() { + userVal := viper.GetStringSlice("params." + s.Name() + "." + k) + if userVal == nil { + continue + } + zippedParams[k] = userVal + } + + parameterSets := computeParamSets(zippedParams, map[string]string{}) + + // For each parameter set combination, clone a scenario and apply user-provided parameter value. 
+ for _, ps := range parameterSets { + sCloned := s.Clone() + for k, userVal := range ps { + v := sCloned.Parameters()[k] + switch v := v.(type) { + case *int: + val, err := strconv.ParseInt(userVal, 10, 32) + if err != nil { + return nil, err + } + *v = int(val) + case *int64: + val, err := strconv.ParseInt(userVal, 10, 64) + if err != nil { + return nil, err + } + *v = val + case *float64: + val, err := strconv.ParseFloat(userVal, 64) + if err != nil { + return nil, err + } + *v = val + case *bool: + val, err := strconv.ParseBool(userVal) + if err != nil { + return nil, err + } + *v = val + case *string: + *v = userVal + default: + return nil, errors.New(fmt.Sprintf("cannot parse parameter. Unknown type %v", v)) + } + } + r[s.Name()] = append(r[s.Name()], sCloned) + } + + // No parameters provided over CLI, keep a single copy. + if len(parameterSets) == 0 { + r[s.Name()] = []scenario.Scenario{s} + } + } + + return r, nil +} + +// computeParamSets recursively combines a map of string slices into all possible key=>value parameter sets. +func computeParamSets(zp map[string][]string, ps map[string]string) []map[string]string { + // Recursion stops when zp is empty. + if len(zp) == 0 { + // XXX: How do I clone a map in golang? + psCloned := map[string]string{} + for k, v := range ps { + psCloned[k] = v + } + return []map[string]string{psCloned} + } + + rps := []map[string]string{} + + // XXX: How do I clone a map in golang? + zpCloned := map[string][]string{} + for k, v := range zp { + zpCloned[k] = v + } + // Take first element from cloned zp and do recursion. + for k, vals := range zpCloned { + delete(zpCloned, k) + for _, v := range vals { + ps[k] = v + rps = append(rps, computeParamSets(zpCloned, ps)...) + } + break + } + + return rps +} + // Register adds a scenario to the runner and the default scenarios list. 
func Register(scenario scenario.Scenario) error { if err := RegisterNondefault(scenario); err != nil { @@ -141,6 +273,12 @@ func initRootEnv(cmd *cobra.Command) (*env.Env, error) { func runRoot(cmd *cobra.Command, args []string) error { cmd.SilenceUsage = true + if viper.GetString(metrics.CfgMetricsAddr) != "" { + oasisTestRunnerOnce.Do(func() { + prometheus.MustRegister(oasisTestRunnerCollectors...) + }) + } + // Initialize the base dir, logging, etc. rootEnv, err := initRootEnv(cmd) if err != nil { @@ -151,7 +289,7 @@ func runRoot(cmd *cobra.Command, args []string) error { // Enumerate the requested test cases. toRun := defaultScenarios // Run all default scenarios if not set. - if vec := viper.GetStringSlice(cfgTest); len(vec) > 0 { + if vec := viper.GetStringSlice(CfgTest); len(vec) > 0 { toRun = nil for _, v := range vec { n := strings.ToLower(v) @@ -177,61 +315,101 @@ func runRoot(cmd *cobra.Command, args []string) error { parallelJobCount := viper.GetInt(cfgParallelJobCount) parallelJobIndex := viper.GetInt(cfgParallelJobIndex) - for index, v := range toRun { - n := v.Name() - - if index%parallelJobCount != parallelJobIndex { - logger.Info("skipping test case (assigned to different parallel job)", - "test", n, - ) - continue - } - - if excludeMap[strings.ToLower(n)] { - logger.Info("skipping test case (excluded by environment)", - "test", n, - ) - continue - } + // Parse test parameters passed by CLI. + var toRunExploded map[string][]scenario.Scenario + toRunExploded, err = parseTestParams(toRun) + if err != nil { + return errors.Wrap(err, "root: failed to parse test params") + } - logger.Info("running test case", - "test", n, - ) - - childEnv, err := rootEnv.NewChild(n) - if err != nil { - logger.Error("failed to setup child environment", - "err", err, - "test", n, - ) - return errors.Wrap(err, "root: failed to setup child environment") - } + // Run all test instances. 
+ index := 0 + for run := 0; run < numRuns; run++ { + for name, sc := range toRunExploded { + for i, v := range sc { + // Maintain unique scenario datadir. + n := fmt.Sprintf("%s/%d", name, run*len(sc)+i) + + if index%parallelJobCount != parallelJobIndex { + logger.Info("skipping test case (assigned to different parallel job)", + "test", n, + ) + index++ + continue + } + + if excludeMap[strings.ToLower(v.Name())] { + logger.Info("skipping test case (excluded by environment)", + "test", n, + ) + index++ + continue + } + + logger.Info("running test case", + "test", n, + ) - if err = doScenario(childEnv, v); err != nil { - logger.Error("failed to run test case", - "err", err, - "test", n, - ) - err = errors.Wrap(err, "root: failed to run test case") - } + childEnv, err := rootEnv.NewChild(n, env.TestInstanceInfo{ + Test: v.Name(), + Instance: filepath.Base(rootEnv.Dir()), + ParameterSet: scenario.ParametersToStringMap(v.Parameters()), + Run: run, + }) + if err != nil { + logger.Error("failed to setup child environment", + "err", err, + "test", n, + ) + return errors.Wrap(err, "root: failed to setup child environment") + } + + // Dump current parameter set to file. + if err = childEnv.WriteTestInstanceInfo(); err != nil { + return err + } + + // Init per-run prometheus pusher, if metrics are enabled. + if viper.GetString(metrics.CfgMetricsAddr) != "" { + pusher = push.New(viper.GetString(metrics.CfgMetricsAddr), oasis.MetricsJobName) + pusher = pusher. + Grouping("instance", childEnv.TestInfo().Instance). + Grouping("run", strconv.Itoa(childEnv.TestInfo().Run)). + Grouping("test", childEnv.TestInfo().Test). + Grouping("software_version", version.SoftwareVersion). + Grouping("git_branch", version.GitBranch). 
+ Gatherer(prometheus.DefaultGatherer) + } + + if err = doScenario(childEnv, v); err != nil { + logger.Error("failed to run test case", + "err", err, + "test", n, + ) + err = errors.Wrap(err, "root: failed to run test case") + } + + if cleanErr := doCleanup(childEnv); cleanErr != nil { + logger.Error("failed to clean up child envionment", + "err", cleanErr, + "test", n, + ) + if err == nil { + err = errors.Wrap(cleanErr, "root: failed to clean up child enviroment") + } + } + + if err != nil { + return err + } + + logger.Info("passed test case", + "test", n, + ) - if cleanErr := doCleanup(childEnv); cleanErr != nil { - logger.Error("failed to clean up child envionment", - "err", cleanErr, - "test", n, - ) - if err == nil { - err = errors.Wrap(cleanErr, "root: failed to clean up child enviroment") + index++ } } - - if err != nil { - return err - } - - logger.Info("passed test case", - "test", n, - ) } return nil @@ -265,11 +443,27 @@ func doScenario(childEnv *env.Env, scenario scenario.Scenario) (err error) { return } + if pusher != nil { + upGauge.Set(1.0) + if err = pusher.Push(); err != nil { + err = errors.Wrap(err, "root: failed to push metrics") + return + } + } + if err = scenario.Run(childEnv); err != nil { err = errors.Wrap(err, "root: failed to run test case") return } + if pusher != nil { + upGauge.Set(0.0) + if err = pusher.Push(); err != nil { + err = errors.Wrap(err, "root: failed to push metrics") + return + } + } + return } @@ -298,7 +492,16 @@ func runList(cmd *cobra.Command, args []string) { }) for _, v := range scenarios { - fmt.Printf(" * %v\n", v.Name()) + fmt.Printf(" * %v", v.Name()) + params := v.Parameters() + if len(params) > 0 { + fmt.Printf(" (parameters:") + for p := range params { + fmt.Printf(" %v", p) + } + fmt.Printf(")") + } + fmt.Printf("\n") } } } @@ -312,7 +515,10 @@ func init() { rootFlags.Var(&logFmt, cfgLogFmt, "log format") rootFlags.Var(&logLevel, cfgLogLevel, "log level") rootFlags.Bool(cfgLogNoStdout, false, "do not 
mutiplex logs to stdout") - rootFlags.StringSliceP(cfgTest, "t", nil, "test(s) to run") + rootFlags.StringSliceP(CfgTest, CfgTestP, nil, "test(s) to run") + rootFlags.String(metrics.CfgMetricsAddr, "", "metrics (prometheus) pushgateway address") + rootFlags.Duration(metrics.CfgMetricsPushInterval, 5*time.Second, "push interval for node exporter and oasis nodes") + rootFlags.IntVarP(&numRuns, cfgNumRuns, "n", 1, "number of runs for given test(s)") rootFlags.Int(cfgParallelJobCount, 1, "(for CI) number of overall parallel jobs") rootFlags.Int(cfgParallelJobIndex, 0, "(for CI) index of this parallel job") _ = viper.BindPFlags(rootFlags) diff --git a/go/oasis-test-runner/cmd/root_test.go b/go/oasis-test-runner/cmd/root_test.go new file mode 100644 index 00000000000..48c6fd7bd21 --- /dev/null +++ b/go/oasis-test-runner/cmd/root_test.go @@ -0,0 +1,25 @@ +package cmd + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestComputeParamSets(t *testing.T) { + zippedParams := map[string][]string{ + "testParam1": []string{"1", "2", "3"}, + "testParam2": []string{"a", "b"}, + } + + expectedParamSets := []map[string]string{ + {"testParam1": "1", "testParam2": "a"}, + {"testParam1": "1", "testParam2": "b"}, + {"testParam1": "2", "testParam2": "a"}, + {"testParam1": "2", "testParam2": "b"}, + {"testParam1": "3", "testParam2": "a"}, + {"testParam1": "3", "testParam2": "b"}, + } + + require.Equal(t, expectedParamSets, computeParamSets(zippedParams, map[string]string{})) +} diff --git a/go/oasis-test-runner/env/env.go b/go/oasis-test-runner/env/env.go index 7bc0d38a8d8..937a3c68988 100644 --- a/go/oasis-test-runner/env/env.go +++ b/go/oasis-test-runner/env/env.go @@ -3,7 +3,9 @@ package env import ( "container/list" + "encoding/json" "errors" + "io/ioutil" "os/exec" "sync" "syscall" @@ -17,6 +19,21 @@ var ErrEarlyTerm = errors.New("env: sub-process exited early") // CleanupFn is the cleanup hook function prototype. 
type CleanupFn func() +// TestInstanceInfo contains information of the current test run. +type TestInstanceInfo struct { + // Test is the name of the test. + Test string `json:"test"` + + // Instance is the instance name of the test. e.g. oasis-test-runner123456 + Instance string `json:"instance"` + + // ParameterSet is the paramater set the test was run with. + ParameterSet map[string]string `json:"parameter_set"` + + // Run is the number of run. + Run int `json:"run"` +} + // Env is a (nested) test environment. type Env struct { name string @@ -26,6 +43,7 @@ type Env struct { children *list.List dir *Dir + testInfo TestInstanceInfo cleanupFns []CleanupFn cleanupCmds []*cmdMonitor cleanupLock sync.Mutex @@ -53,6 +71,11 @@ func (env *Env) NewSubDir(subDirName string) (*Dir, error) { return env.dir.NewSubDir(subDirName) } +// TestInfo returns the test instance information. +func (env *Env) TestInfo() TestInstanceInfo { + return env.testInfo +} + // AddOnCleanup adds a cleanup routine to be called durring the environment's // cleanup. Routines will be called in reverse order that they were // registered. @@ -130,7 +153,7 @@ func (env *Env) Cleanup() { } // NewChild returns a new child test environment. -func (env *Env) NewChild(childName string) (*Env, error) { +func (env *Env) NewChild(childName string, testInfo TestInstanceInfo) (*Env, error) { var parentDir *Dir if env.parent != nil { parentDir = env.parent.dir @@ -148,12 +171,26 @@ func (env *Env) NewChild(childName string) (*Env, error) { parent: env, children: list.New(), dir: subDir, + testInfo: testInfo, } child.parentElem = env.children.PushBack(child) return child, nil } +// WriteParamSetToFile dumps test instance parameter set to test_instance_info.json file for debugging afterwards. 
+func (env *Env) WriteTestInstanceInfo() error { + b, err := json.Marshal(env.testInfo) + if err != nil { + return err + } + if err = ioutil.WriteFile(env.Dir()+"/test_instance_info.json", b, 0644); err != nil { + return err + } + + return nil +} + // New creates a new root test environment. func New(dir *Dir) *Env { return &Env{ diff --git a/go/oasis-test-runner/oasis/args.go b/go/oasis-test-runner/oasis/args.go index c806a416900..94fd8810bad 100644 --- a/go/oasis-test-runner/oasis/args.go +++ b/go/oasis-test-runner/oasis/args.go @@ -5,18 +5,23 @@ import ( "fmt" "path/filepath" "strconv" + "strings" "time" + "github.com/spf13/viper" + "github.com/oasislabs/oasis-core/go/common" commonGrpc "github.com/oasislabs/oasis-core/go/common/grpc" "github.com/oasislabs/oasis-core/go/common/node" "github.com/oasislabs/oasis-core/go/common/sgx" + "github.com/oasislabs/oasis-core/go/common/version" "github.com/oasislabs/oasis-core/go/consensus/tendermint" epochtime "github.com/oasislabs/oasis-core/go/epochtime/api" "github.com/oasislabs/oasis-core/go/ias" cmdCommon "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/flags" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/grpc" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/debug/byzantine" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/debug/supplementarysanity" runtimeRegistry "github.com/oasislabs/oasis-core/go/runtime/registry" @@ -32,6 +37,17 @@ import ( workerStorage "github.com/oasislabs/oasis-core/go/worker/storage" ) +const ( + MetricsJobName = "oasis-test-runner" + + MetricsLabelGitBranch = "git_branch" + MetricsLabelInstance = "instance" + MetricsLabelRun = "run" + MetricsLabelSoftwareVersion = "software_version" + MetricsLabelTest = "test" + MetricsLabelTEEHardware = "runtime_tee_hardware" +) + type argBuilder struct { vec []string } @@ -403,6 +419,38 @@ func (args 
*argBuilder) appendSeedNodes(net *Network) *argBuilder { return args } +func (args *argBuilder) appendNodeMetrics(node *Node) *argBuilder { + args.vec = append(args.vec, []string{ + "--" + metrics.CfgMetricsMode, metrics.MetricsModePush, + "--" + metrics.CfgMetricsAddr, viper.GetString(metrics.CfgMetricsAddr), + "--" + metrics.CfgMetricsPushInterval, viper.GetString(metrics.CfgMetricsPushInterval), + "--" + metrics.CfgMetricsPushJobName, node.Name}...) + + // Append labels. + args.vec = append(args.vec, "--"+metrics.CfgMetricsPushLabels) + ti := node.net.env.TestInfo() + l := []string{MetricsLabelInstance + "=" + ti.Instance, + MetricsLabelRun + "=" + strconv.Itoa(ti.Run), + MetricsLabelTest + "=" + ti.Test, + MetricsLabelSoftwareVersion + "=" + version.SoftwareVersion, + } + if version.GitBranch != "" { + l = append(l, MetricsLabelGitBranch+"="+version.GitBranch) + } + // Populate it with test-provided parameters. + for k, v := range ti.ParameterSet { + l = append(l, metrics.EscapeLabelCharacters(k)+"="+v) + } + // Populate it with TEE hardware info. 
+ if len(node.net.runtimes) > 0 { + l = append(l, MetricsLabelTEEHardware+"="+node.net.runtimes[0].teeHardware.String()) + } + + args.vec = append(args.vec, strings.Join(l, ",")) + + return args +} + func (args *argBuilder) appendNetwork(net *Network) *argBuilder { args = args.grpcLogDebug() return args diff --git a/go/oasis-test-runner/oasis/oasis.go b/go/oasis-test-runner/oasis/oasis.go index 196ce890188..ff3ea01ce04 100644 --- a/go/oasis-test-runner/oasis/oasis.go +++ b/go/oasis-test-runner/oasis/oasis.go @@ -15,6 +15,7 @@ import ( "time" "github.com/pkg/errors" + "github.com/spf13/viper" "github.com/oasislabs/oasis-core/go/common/crypto/drbg" "github.com/oasislabs/oasis-core/go/common/crypto/signature" @@ -25,6 +26,7 @@ import ( genesisFile "github.com/oasislabs/oasis-core/go/genesis/file" genesisTestHelpers "github.com/oasislabs/oasis-core/go/genesis/tests" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/metrics" "github.com/oasislabs/oasis-core/go/oasis-node/cmd/genesis" "github.com/oasislabs/oasis-core/go/oasis-test-runner/env" "github.com/oasislabs/oasis-core/go/oasis-test-runner/log" @@ -615,6 +617,9 @@ func (net *Network) startOasisNode( tendermintDebugAddrBookLenient(). tendermintDebugAllowDuplicateIP() } + if viper.GetString(metrics.CfgMetricsAddr) != "" { + extraArgs = extraArgs.appendNodeMetrics(node) + } args := append([]string{}, subCmd...) args = append(args, baseArgs...) args = append(args, extraArgs.vec...) 
diff --git a/go/oasis-test-runner/scenario/e2e/basic.go b/go/oasis-test-runner/scenario/e2e/basic.go index c4efde496c6..86b99f195ec 100644 --- a/go/oasis-test-runner/scenario/e2e/basic.go +++ b/go/oasis-test-runner/scenario/e2e/basic.go @@ -62,10 +62,23 @@ type basicImpl struct { logger *logging.Logger } +func (sc *basicImpl) Clone() scenario.Scenario { + return &basicImpl{ + name: sc.name, + clientBinary: sc.clientBinary, + clientArgs: sc.clientArgs, + logger: logging.GetLogger("scenario/e2e/" + sc.name), + } +} + func (sc *basicImpl) Name() string { return sc.name } +func (sc *basicImpl) Parameters() map[string]interface{} { + return NoParameters +} + func (sc *basicImpl) Fixture() (*oasis.NetworkFixture, error) { var tee node.TEEHardware err := tee.FromString(viper.GetString(cfgTEEHardware)) diff --git a/go/oasis-test-runner/scenario/e2e/byzantine.go b/go/oasis-test-runner/scenario/e2e/byzantine.go index 873f24f4e36..0d0193bd9d7 100644 --- a/go/oasis-test-runner/scenario/e2e/byzantine.go +++ b/go/oasis-test-runner/scenario/e2e/byzantine.go @@ -76,6 +76,15 @@ func newByzantineImpl(script string, logWatcherHandlerFactories []log.WatcherHan } } +func (sc *byzantineImpl) Clone() scenario.Scenario { + return &byzantineImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + script: sc.script, + identitySeed: sc.identitySeed, + logWatcherHandlerFactories: sc.logWatcherHandlerFactories, + } +} + func (sc *byzantineImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/common.go b/go/oasis-test-runner/scenario/e2e/common.go index 89bc6b41651..65874f78067 100644 --- a/go/oasis-test-runner/scenario/e2e/common.go +++ b/go/oasis-test-runner/scenario/e2e/common.go @@ -32,6 +32,9 @@ var ( // Flags is the command line flags for the e2e tests. Flags = flag.NewFlagSet("", flag.ContinueOnError) + // NoParameters is used when no parameters can be set for the scenario using CLI. 
+ NoParameters = make(map[string]interface{}) + runtimeID common.Namespace keymanagerID common.Namespace diff --git a/go/oasis-test-runner/scenario/e2e/debond.go b/go/oasis-test-runner/scenario/e2e/debond.go index 1dc2582379c..6309106ac40 100644 --- a/go/oasis-test-runner/scenario/e2e/debond.go +++ b/go/oasis-test-runner/scenario/e2e/debond.go @@ -21,6 +21,12 @@ type debondImpl struct { basicImpl } +func (s *debondImpl) Clone() scenario.Scenario { + return &debondImpl{ + basicImpl: *s.basicImpl.Clone().(*basicImpl), + } +} + func (s *debondImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := s.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/dump_restore.go b/go/oasis-test-runner/scenario/e2e/dump_restore.go index ca2c58dae6b..fd5d3651e42 100644 --- a/go/oasis-test-runner/scenario/e2e/dump_restore.go +++ b/go/oasis-test-runner/scenario/e2e/dump_restore.go @@ -29,6 +29,12 @@ func newDumpRestoreImpl() scenario.Scenario { return sc } +func (sc *dumpRestoreImpl) Clone() scenario.Scenario { + return &dumpRestoreImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *dumpRestoreImpl) Run(childEnv *env.Env) error { clientErrCh, cmd, err := sc.basicImpl.start(childEnv) if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/gas_fees_runtimes.go b/go/oasis-test-runner/scenario/e2e/gas_fees_runtimes.go index 244a606c65f..6301fc5266a 100644 --- a/go/oasis-test-runner/scenario/e2e/gas_fees_runtimes.go +++ b/go/oasis-test-runner/scenario/e2e/gas_fees_runtimes.go @@ -22,6 +22,12 @@ type gasFeesRuntimesImpl struct { basicImpl } +func (sc *gasFeesRuntimesImpl) Clone() scenario.Scenario { + return &gasFeesRuntimesImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *gasFeesRuntimesImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/gas_fees_staking.go b/go/oasis-test-runner/scenario/e2e/gas_fees_staking.go 
index e707d34dea5..b6c82274710 100644 --- a/go/oasis-test-runner/scenario/e2e/gas_fees_staking.go +++ b/go/oasis-test-runner/scenario/e2e/gas_fees_staking.go @@ -40,10 +40,20 @@ type gasFeesImpl struct { logger *logging.Logger } +func (sc *gasFeesImpl) Clone() scenario.Scenario { + return &gasFeesImpl{ + logger: logging.GetLogger("scenario/e2e/gas-fees/staking"), + } +} + func (sc *gasFeesImpl) Name() string { return "gas-fees/staking" } +func (sc *gasFeesImpl) Parameters() map[string]interface{} { + return NoParameters +} + func (sc *gasFeesImpl) Fixture() (*oasis.NetworkFixture, error) { var tee node.TEEHardware err := tee.FromString(viper.GetString(cfgTEEHardware)) diff --git a/go/oasis-test-runner/scenario/e2e/halt_restore.go b/go/oasis-test-runner/scenario/e2e/halt_restore.go index f81a4bd4f53..087ac41e713 100644 --- a/go/oasis-test-runner/scenario/e2e/halt_restore.go +++ b/go/oasis-test-runner/scenario/e2e/halt_restore.go @@ -40,6 +40,12 @@ func newHaltRestoreImpl() scenario.Scenario { } } +func (sc *haltRestoreImpl) Clone() scenario.Scenario { + return &haltRestoreImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *haltRestoreImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/identity_cli.go b/go/oasis-test-runner/scenario/e2e/identity_cli.go index a74f407b994..70dcbbd0da9 100644 --- a/go/oasis-test-runner/scenario/e2e/identity_cli.go +++ b/go/oasis-test-runner/scenario/e2e/identity_cli.go @@ -30,10 +30,22 @@ type identityCLIImpl struct { logger *logging.Logger } +func (ident *identityCLIImpl) Clone() scenario.Scenario { + return &identityCLIImpl{ + nodeBinary: ident.nodeBinary, + dataDir: ident.dataDir, + logger: logging.GetLogger("scenario/e2e/identity-cli"), + } +} + func (ident *identityCLIImpl) Name() string { return "identity-cli" } +func (ident *identityCLIImpl) Parameters() map[string]interface{} { + return NoParameters +} + func (ident 
*identityCLIImpl) Init(childEnv *env.Env, net *oasis.Network) error { ident.nodeBinary = viper.GetString(cfgNodeBinary) diff --git a/go/oasis-test-runner/scenario/e2e/keymanager_restart.go b/go/oasis-test-runner/scenario/e2e/keymanager_restart.go index 6e9443f8e8e..11c8b187a2f 100644 --- a/go/oasis-test-runner/scenario/e2e/keymanager_restart.go +++ b/go/oasis-test-runner/scenario/e2e/keymanager_restart.go @@ -27,6 +27,12 @@ func newKmRestartImpl() scenario.Scenario { } } +func (sc *kmRestartImpl) Clone() scenario.Scenario { + return &kmRestartImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *kmRestartImpl) Run(childEnv *env.Env) error { clientErrCh, cmd, err := sc.basicImpl.start(childEnv) if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/multiple_runtimes.go b/go/oasis-test-runner/scenario/e2e/multiple_runtimes.go index 833baab0f7b..32c1b0c9225 100644 --- a/go/oasis-test-runner/scenario/e2e/multiple_runtimes.go +++ b/go/oasis-test-runner/scenario/e2e/multiple_runtimes.go @@ -3,33 +3,29 @@ package e2e import ( "context" "fmt" + "strconv" "time" "github.com/oasislabs/oasis-core/go/common" "github.com/oasislabs/oasis-core/go/common/logging" epochtime "github.com/oasislabs/oasis-core/go/epochtime/api" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/pprof" "github.com/oasislabs/oasis-core/go/oasis-test-runner/env" "github.com/oasislabs/oasis-core/go/oasis-test-runner/oasis" "github.com/oasislabs/oasis-core/go/oasis-test-runner/scenario" registry "github.com/oasislabs/oasis-core/go/registry/api" ) -const ( - // numComputeRuntimes is the number of runtimes, all with common runtimeBinary registered. - numComputeRuntimes = 2 - - // numComputeRuntimeTxns is the number of insert test transactions sent to each runtime. - numComputeRuntimeTxns = 2 - - // numComputeWorkers is the number of compute workers. - numComputeWorkers = 1 -) - var ( // MultipleRuntimes is a scenario which tests running multiple runtimes on one node. 
MultipleRuntimes scenario.Scenario = &multipleRuntimesImpl{ basicImpl: *newBasicImpl("multiple-runtimes", "simple-keyvalue-client", nil), logger: logging.GetLogger("scenario/e2e/multiple_runtimes"), + + numComputeRuntimes: 2, + numComputeRuntimeTxns: 2, + numComputeWorkers: 2, + executorGroupSize: 2, } ) @@ -37,13 +33,47 @@ type multipleRuntimesImpl struct { basicImpl logger *logging.Logger + + // numComputeRuntimes is the number of runtimes, all with common runtimeBinary registered. + numComputeRuntimes int + + // numComputeRuntimeTxns is the number of insert test transactions sent to each runtime. + numComputeRuntimeTxns int + + // numComputeWorkers is the number of compute workers. + numComputeWorkers int + + // executorGroupSize is the number of executor nodes. + executorGroupSize int +} + +func (mr *multipleRuntimesImpl) Clone() scenario.Scenario { + return &multipleRuntimesImpl{ + basicImpl: *mr.basicImpl.Clone().(*basicImpl), + logger: logging.GetLogger("scenario/e2e/multiple_runtimes"), + + numComputeRuntimes: mr.numComputeRuntimes, + numComputeRuntimeTxns: mr.numComputeRuntimeTxns, + numComputeWorkers: mr.numComputeWorkers, + executorGroupSize: mr.executorGroupSize, + } } func (mr *multipleRuntimesImpl) Name() string { return "multiple-runtimes" } +func (mr *multipleRuntimesImpl) Parameters() map[string]interface{} { + return map[string]interface{}{ + "num_compute_runtimes": &mr.numComputeRuntimes, + "num_compute_runtime_txns": &mr.numComputeRuntimeTxns, + "num_compute_workers": &mr.numComputeWorkers, + "executor_group_size": &mr.executorGroupSize, + } +} + func (mr *multipleRuntimesImpl) Fixture() (*oasis.NetworkFixture, error) { + // Take default fixtures from Basic test. f, err := mr.basicImpl.Fixture() if err != nil { return nil, err @@ -70,7 +100,7 @@ func (mr *multipleRuntimesImpl) Fixture() (*oasis.NetworkFixture, error) { f.Network.EpochtimeMock = true // Add some more consecutive runtime IDs with the same binary. 
- for i := 1; i <= numComputeRuntimes; i++ { + for i := 1; i <= mr.numComputeRuntimes; i++ { // Increase LSB by 1. id[len(id)-1]++ @@ -81,7 +111,7 @@ func (mr *multipleRuntimesImpl) Fixture() (*oasis.NetworkFixture, error) { Keymanager: 0, Binary: runtimeBinary, Executor: registry.ExecutorParameters{ - GroupSize: 1, + GroupSize: uint64(mr.executorGroupSize), GroupBackupSize: 0, RoundTimeout: 10 * time.Second, }, @@ -114,7 +144,7 @@ func (mr *multipleRuntimesImpl) Fixture() (*oasis.NetworkFixture, error) { // Use numComputeWorkers compute worker fixtures. f.ComputeWorkers = []oasis.ComputeWorkerFixture{} - for i := 0; i < numComputeWorkers; i++ { + for i := 0; i < mr.numComputeWorkers; i++ { f.ComputeWorkers = append(f.ComputeWorkers, oasis.ComputeWorkerFixture{Entity: 1}) } @@ -138,16 +168,20 @@ func (mr *multipleRuntimesImpl) Run(childEnv *env.Env) error { for _, r := range mr.net.Runtimes() { rt := r.ToRuntimeDescriptor() if rt.Kind == registry.KindCompute { - for i := 0; i < numComputeRuntimeTxns; i++ { + for i := 0; i < mr.numComputeRuntimeTxns; i++ { mr.logger.Info("submitting transaction to runtime", "seq", i, "runtime_id", rt.ID, ) - if err := mr.submitRuntimeTx(ctx, rt.ID, "hello", fmt.Sprintf("world %d from %s", i, rt.ID)); err != nil { + _ = pprof.WriteHeap("multiple-runtimes.runtime_" + strconv.Itoa(i) + ".beforeSubmitRuntimeTx") + + if err := mr.submitRuntimeTx(ctx, rt.ID, "hello", fmt.Sprintf("world at iteration %d from %s", i, rt.ID)); err != nil { return err } + _ = pprof.WriteHeap("multiple-runtimes.runtime_" + strconv.Itoa(i) + ".afterSubmitRuntimeTx") + mr.logger.Info("triggering epoch transition", "epoch", epoch, ) diff --git a/go/oasis-test-runner/scenario/e2e/node_shutdown.go b/go/oasis-test-runner/scenario/e2e/node_shutdown.go index 8ad10612a03..d300d946437 100644 --- a/go/oasis-test-runner/scenario/e2e/node_shutdown.go +++ b/go/oasis-test-runner/scenario/e2e/node_shutdown.go @@ -27,6 +27,12 @@ func newNodeShutdownImpl() scenario.Scenario { 
return sc } +func (sc *nodeShutdownImpl) Clone() scenario.Scenario { + return &nodeShutdownImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *nodeShutdownImpl) Name() string { return "node-shutdown" } diff --git a/go/oasis-test-runner/scenario/e2e/registry_cli.go b/go/oasis-test-runner/scenario/e2e/registry_cli.go index bc4f7828cd8..6e36a0cd953 100644 --- a/go/oasis-test-runner/scenario/e2e/registry_cli.go +++ b/go/oasis-test-runner/scenario/e2e/registry_cli.go @@ -44,6 +44,12 @@ type registryCLIImpl struct { basicImpl } +func (r *registryCLIImpl) Clone() scenario.Scenario { + return ®istryCLIImpl{ + basicImpl: *r.basicImpl.Clone().(*basicImpl), + } +} + func (r *registryCLIImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := r.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/runtime_dynamic.go b/go/oasis-test-runner/scenario/e2e/runtime_dynamic.go index 54fd011de61..84c5ae82d06 100644 --- a/go/oasis-test-runner/scenario/e2e/runtime_dynamic.go +++ b/go/oasis-test-runner/scenario/e2e/runtime_dynamic.go @@ -36,6 +36,13 @@ func newRuntimeDynamicImpl() scenario.Scenario { } } +func (sc *runtimeDynamicImpl) Clone() scenario.Scenario { + return &runtimeDynamicImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + epoch: sc.epoch, + } +} + func (sc *runtimeDynamicImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/runtime_prune.go b/go/oasis-test-runner/scenario/e2e/runtime_prune.go index cc08a13ed25..751e472b8a8 100644 --- a/go/oasis-test-runner/scenario/e2e/runtime_prune.go +++ b/go/oasis-test-runner/scenario/e2e/runtime_prune.go @@ -37,6 +37,12 @@ func newRuntimePruneImpl() scenario.Scenario { } } +func (sc *runtimePruneImpl) Clone() scenario.Scenario { + return &runtimePruneImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *runtimePruneImpl) Fixture() (*oasis.NetworkFixture, error) { f, err 
:= sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/sentry.go b/go/oasis-test-runner/scenario/e2e/sentry.go index b203d9e0d37..76b25d92f5a 100644 --- a/go/oasis-test-runner/scenario/e2e/sentry.go +++ b/go/oasis-test-runner/scenario/e2e/sentry.go @@ -28,6 +28,12 @@ func newSentryImpl(name, clientBinary string, clientArgs []string) scenario.Scen } } +func (s *sentryImpl) Clone() scenario.Scenario { + return &sentryImpl{ + basicImpl: *s.basicImpl.Clone().(*basicImpl), + } +} + func (s *sentryImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := s.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/stake_cli.go b/go/oasis-test-runner/scenario/e2e/stake_cli.go index 088d3f11273..43fa251a9af 100644 --- a/go/oasis-test-runner/scenario/e2e/stake_cli.go +++ b/go/oasis-test-runner/scenario/e2e/stake_cli.go @@ -67,6 +67,12 @@ type stakeCLIImpl struct { basicImpl } +func (s *stakeCLIImpl) Clone() scenario.Scenario { + return &stakeCLIImpl{ + basicImpl: *s.basicImpl.Clone().(*basicImpl), + } +} + func (s *stakeCLIImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := s.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/storage_sync.go b/go/oasis-test-runner/scenario/e2e/storage_sync.go index d318643e80a..e2f1d4292e8 100644 --- a/go/oasis-test-runner/scenario/e2e/storage_sync.go +++ b/go/oasis-test-runner/scenario/e2e/storage_sync.go @@ -29,6 +29,12 @@ func newStorageSyncImpl() scenario.Scenario { } } +func (sc *storageSyncImpl) Clone() scenario.Scenario { + return &storageSyncImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + } +} + func (sc *storageSyncImpl) Fixture() (*oasis.NetworkFixture, error) { f, err := sc.basicImpl.Fixture() if err != nil { diff --git a/go/oasis-test-runner/scenario/e2e/txsource.go b/go/oasis-test-runner/scenario/e2e/txsource.go index 0785688b064..91697079ea0 100644 --- a/go/oasis-test-runner/scenario/e2e/txsource.go +++ 
b/go/oasis-test-runner/scenario/e2e/txsource.go @@ -267,6 +267,17 @@ func (sc *txSourceImpl) startWorkload(childEnv *env.Env, errCh chan error, name return nil } +func (sc *txSourceImpl) Clone() scenario.Scenario { + return &txSourceImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + workloads: sc.workloads, + timeLimit: sc.timeLimit, + nodeRestartInterval: sc.nodeRestartInterval, + livenessCheckInterval: sc.livenessCheckInterval, + rng: sc.rng, + } +} + func (sc *txSourceImpl) Run(childEnv *env.Env) error { if err := sc.net.Start(); err != nil { return fmt.Errorf("scenario net Start: %w", err) diff --git a/go/oasis-test-runner/scenario/e2e/upgrade.go b/go/oasis-test-runner/scenario/e2e/upgrade.go index 9e086d238f5..9e6321ce55b 100644 --- a/go/oasis-test-runner/scenario/e2e/upgrade.go +++ b/go/oasis-test-runner/scenario/e2e/upgrade.go @@ -124,6 +124,13 @@ func (sc *nodeUpgradeImpl) Name() string { return "node-upgrade" } +func (sc *nodeUpgradeImpl) Clone() scenario.Scenario { + return &nodeUpgradeImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + ctx: sc.ctx, + } +} + func (sc *nodeUpgradeImpl) Fixture() (*oasis.NetworkFixture, error) { var tee node.TEEHardware err := tee.FromString(viper.GetString(cfgTEEHardware)) diff --git a/go/oasis-test-runner/scenario/e2e/upgrade_cancel.go b/go/oasis-test-runner/scenario/e2e/upgrade_cancel.go index 42e226c3e48..4266f825234 100644 --- a/go/oasis-test-runner/scenario/e2e/upgrade_cancel.go +++ b/go/oasis-test-runner/scenario/e2e/upgrade_cancel.go @@ -55,6 +55,13 @@ func newNodeUpgradeCancelImpl() scenario.Scenario { return sc } +func (sc *nodeUpgradeCancelImpl) Clone() scenario.Scenario { + return &nodeUpgradeCancelImpl{ + basicImpl: *sc.basicImpl.Clone().(*basicImpl), + ctx: sc.ctx, + } +} + func (sc *nodeUpgradeCancelImpl) Name() string { return "node-upgrade-cancel" } diff --git a/go/oasis-test-runner/scenario/remotesigner/basic.go b/go/oasis-test-runner/scenario/remotesigner/basic.go index ac634060ab2..17c0d4ea117 
100644 --- a/go/oasis-test-runner/scenario/remotesigner/basic.go +++ b/go/oasis-test-runner/scenario/remotesigner/basic.go @@ -5,9 +5,6 @@ import ( "path/filepath" "time" - flag "github.com/spf13/pflag" - "github.com/spf13/viper" - "github.com/oasislabs/oasis-core/go/common/crypto/signature" fileSigner "github.com/oasislabs/oasis-core/go/common/crypto/signature/signers/file" remoteSigner "github.com/oasislabs/oasis-core/go/common/crypto/signature/signers/remote" @@ -20,24 +17,34 @@ import ( "github.com/oasislabs/oasis-core/go/oasis-test-runner/scenario" ) -const cfgRemoteSignerBinary = "remote_signer.binary" - var ( // Basic is the basic test case. Basic scenario.Scenario = newBasicImpl() - - // Flags is the command line flags for the remote signer tests. - Flags = flag.NewFlagSet("", flag.ContinueOnError) ) func newBasicImpl() *basicImpl { return &basicImpl{ - logger: logging.GetLogger("remote-signer/basic"), + logger: logging.GetLogger("remote-signer/basic"), + serverBinary: "oasis-remote-signer", } } type basicImpl struct { logger *logging.Logger + + serverBinary string +} + +func (sc *basicImpl) Clone() scenario.Scenario { + return &basicImpl{ + logger: logging.GetLogger("remote-signer/basic"), + } +} + +func (sc *basicImpl) Parameters() map[string]interface{} { + return map[string]interface{}{ + "binary": &sc.serverBinary, + } } func (sc *basicImpl) Name() string { @@ -53,15 +60,13 @@ func (sc *basicImpl) Init(childEnv *env.Env, net *oasis.Network) error { } func (sc *basicImpl) Run(childEnv *env.Env) error { - serverBinary := viper.GetString(cfgRemoteSignerBinary) - // Provision the server keys. 
sc.logger.Info("provisioning the server keys") if err := cli.RunSubCommand( childEnv, sc.logger, "init", - serverBinary, + sc.serverBinary, []string{ "--" + cmdCommon.CfgDataDir, childEnv.Dir(), "init", @@ -87,7 +92,7 @@ func (sc *basicImpl) Run(childEnv *env.Env) error { childEnv, sc.logger, "init_client", - serverBinary, + sc.serverBinary, []string{ "--" + cmdCommon.CfgDataDir, childEnv.Dir(), "init_client", @@ -106,7 +111,7 @@ func (sc *basicImpl) Run(childEnv *env.Env) error { childEnv, sc.logger, "server", - serverBinary, + sc.serverBinary, []string{ "--" + cmdCommon.CfgDataDir, childEnv.Dir(), "--client.certificate", filepath.Join(childEnv.Dir(), "remote_signer_client_cert.pem"), @@ -189,8 +194,3 @@ func (sc *basicImpl) Run(childEnv *env.Env) error { return nil } - -func init() { - Flags.String(cfgRemoteSignerBinary, "oasis-remote-signer", "path to the remote-signer binary") - _ = viper.BindPFlags(Flags) -} diff --git a/go/oasis-test-runner/scenario/scenario.go b/go/oasis-test-runner/scenario/scenario.go index 16ec1a0149c..0d1b22b1880 100644 --- a/go/oasis-test-runner/scenario/scenario.go +++ b/go/oasis-test-runner/scenario/scenario.go @@ -2,18 +2,49 @@ package scenario import ( + "strconv" + "github.com/oasislabs/oasis-core/go/oasis-test-runner/env" "github.com/oasislabs/oasis-core/go/oasis-test-runner/oasis" ) +// ParametersToStringMap convert scenario-specific key->value parameter set to string->string map. +func ParametersToStringMap(p map[string]interface{}) map[string]string { + sMap := make(map[string]string) + for k, v := range p { + switch v := v.(type) { + case *int: + sMap[k] = strconv.Itoa(*v) + case *int64: + sMap[k] = strconv.FormatInt(*v, 10) + case *float64: + sMap[k] = strconv.FormatFloat(*v, 'E', -1, 64) + case *bool: + sMap[k] = strconv.FormatBool(*v) + default: + sMap[k] = *v.(*string) + } + } + return sMap +} + // Scenario is a test scenario identified by name. 
type Scenario interface { + // Clone returns a copy of this scenario instance to be run in parallel. + Clone() Scenario + // Name returns the name of the scenario. // // Note: The name is used when selecting which tests to run, and should // be something suitable for use as a command line argument. Name() string + // Parameters returns the settable test parameters via CLI. + // + // The returned map should contain parameter name -> reference to the + // variable the parameter value should be stored to. + Parameters() map[string]interface{} + // Fixture returns a network fixture to use for this scenario. // // It may return nil in case the scenario doesn't use a fixture and diff --git a/go/oasis-test-runner/test-runner.go b/go/oasis-test-runner/test-runner.go index 0929f0548ee..76079498e63 100644 --- a/go/oasis-test-runner/test-runner.go +++ b/go/oasis-test-runner/test-runner.go @@ -68,7 +68,6 @@ func main() { _ = cmd.Register(e2e.Debond) // Register the remote signer test cases. - rootCmd.Flags().AddFlagSet(remotesigner.Flags) _ = cmd.Register(remotesigner.Basic) // Execute the command, now that everything has been initialized. diff --git a/go/worker/common/committee/node.go b/go/worker/common/committee/node.go index de341ee3064..1c897d9d826 100644 --- a/go/worker/common/committee/node.go +++ b/go/worker/common/committee/node.go @@ -3,9 +3,8 @@ package committee import ( "context" "errors" - "sync" - "github.com/prometheus/client_golang/prometheus" + "sync" "github.com/oasislabs/oasis-core/go/common/identity" "github.com/oasislabs/oasis-core/go/common/logging" @@ -112,6 +111,7 @@ func (n *Node) Name() string { // Start starts the service. 
func (n *Node) Start() error { go n.worker() + return nil } diff --git a/go/worker/common/worker.go b/go/worker/common/worker.go index 3b0ae738412..1df12d489d4 100644 --- a/go/worker/common/worker.go +++ b/go/worker/common/worker.go @@ -2,7 +2,6 @@ package common import ( "fmt" - "github.com/oasislabs/oasis-core/go/common" "github.com/oasislabs/oasis-core/go/common/grpc" "github.com/oasislabs/oasis-core/go/common/grpc/policy" diff --git a/go/worker/compute/txnscheduler/service.go b/go/worker/compute/txnscheduler/service.go index 24dc4707381..38c9560ce8c 100644 --- a/go/worker/compute/txnscheduler/service.go +++ b/go/worker/compute/txnscheduler/service.go @@ -3,6 +3,7 @@ package txnscheduler import ( "context" + "github.com/oasislabs/oasis-core/go/oasis-node/cmd/common/pprof" "github.com/oasislabs/oasis-core/go/worker/compute/txnscheduler/api" ) @@ -15,9 +16,13 @@ func (w *Worker) SubmitTx(ctx context.Context, rq *api.SubmitTxRequest) (*api.Su return nil, api.ErrUnknownRuntime } + rtid, _ := rq.RuntimeID.MarshalText() + _ = pprof.WriteHeap("multiple-runtimes.worker_" + string(rtid) + ".beforeSubmitRuntimeTx") if err := runtime.QueueCall(ctx, rq.ExpectedEpochNumber, rq.Data); err != nil { return nil, err } + _ = pprof.WriteHeap("multiple-runtimes.worker_" + string(rtid) + ".afterSubmitRuntimeTx") + return &api.SubmitTxResponse{}, nil } diff --git a/parse_bench.py b/parse_bench.py new file mode 100755 index 00000000000..ac4bce05c31 --- /dev/null +++ b/parse_bench.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 + +import os, sys + +def parse_memory_from_sar(filename): + mem = [] + i=0 + for line in open(filename, 'r'): + i += 1 + # First three lines are a comment. 
+ if i<=3: + continue + + vals = line.split() + if len(vals)>3: + mem.append(int(vals[3])) + + return mem + +if len(sys.argv) < 2: + print("Usage "+sys.argv[0]+" <benchmarks results folder>") + exit(1) + +BENCH_DIR = sys.argv[1] + +for filename in os.listdir(BENCH_DIR): + chunks = filename.split('.') + + test_name = chunks[0] + param = dict() + for i in range(1,len(chunks)-1): + t, v = chunks[i].split('_') + param[t] = v + + bench_type = chunks[-1] + + if bench_type == "sar": + mem = parse_memory_from_sar(BENCH_DIR+"/"+filename) + if len(mem)>0: + print("test "+test_name+" with params "+str(param)+" max mem usage: "+str(int(sorted(mem)[-1])/(1024))+" MiB") diff --git a/scripts/prometheus-dev.py b/scripts/prometheus-dev.py index fee0629b61e..f8692ce5957 100755 --- a/scripts/prometheus-dev.py +++ b/scripts/prometheus-dev.py @@ -1,11 +1,11 @@ #!/usr/bin/env python """ Starts Prometheus docker container and setups docker networking so -Ekiden and Prometheus containers can be connected. +Oasis node and Prometheus containers can be connected. -Script assumes Ekiden container is allready running and by default will -attach to container named "ekiden-*****". Use "--ekiden-name" to overide -the default name. +Script assumes Oasis node container is already running and by default will +attach to container named "oasis-node-*****". Use "--oasis-node-name" to +override the default name. Usage: ./scripts/prometheus.py \ @@ -61,7 +61,7 @@ def create_network(name): def connect_container(network_name, container_name): - """ Connects container to netowork *network_name*. """ + """ Connects container to network *network_name*. """ container_id = get_container_id(container_name) # Check if container allready connected to the network: @@ -76,7 +76,7 @@ def connect_container(network_name, container_name): # Container not yet connected.
subprocess.check_call( ['docker', 'network', 'connect', '--alias', - 'ekiden', network_name, container_id] + 'oasis-node', network_name, container_id] ) @@ -116,12 +116,12 @@ def run_pushgateway(container_name, network_name): required.add_argument('--config', type=str, required=True, help="Path to prometheus.yml config.") - optional.add_argument('--ekiden-name', type=str, default="ekiden-", - help="Running ekiden container name.") + optional.add_argument('--oasis-node-name', type=str, default="oasis-node-", + help="Running oasis node container name.") optional.add_argument('--name', type=str, default="prometheus", help="Prometheus container name. Default: prometheus") - optional.add_argument('--network-name', type=str, default="prometheus-ekiden", - help="Netowrk name for prometheus-ekiden container connection. Default: prometheus-ekiden") + optional.add_argument('--network-name', type=str, default="prometheus-oasis-node", + help="Network name for prometheus-oasis-node container connection. Default: prometheus-oasis-node") optional.add_argument('--port', type=str, default='9090', help="Localhost port exposing prometheus web interface. 
Default: 9090") optional.add_argument('--push-gateway', action='store_true', default=False, @@ -131,7 +131,7 @@ def run_pushgateway(container_name, network_name): args = parser.parse_args() network_name = args.network_name - ekiden_container = args.ekiden_name + oasis_node_container = args.oasis_node_name prometheus_container = args.name exposed_port = args.port prometheus_config = args.config @@ -145,16 +145,16 @@ def run_pushgateway(container_name, network_name): prometheus_config = os.path.abspath(prometheus_config) - # Checked if passed ekiden container is running - if not container_running(ekiden_container): - print("ERROR: Ekiden container: '{}' not running!".format(ekiden_container)) + # Checked if passed Oasis node container is running + if not container_running(oasis_node_container): + print("ERROR: Oasis node container: '{}' not running!".format(oasis_node_container)) sys.exit(1) if not network_exists(network_name): create_network(network_name) - # Connect ekiden container to network (if not yet connected). - connect_container(network_name, ekiden_container) + # Connect Oasis node container to network (if not yet connected). + connect_container(network_name, oasis_node_container) if push_gateway: run_pushgateway("prometheus-pushgateway", network_name)