diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 715c6478417..7103cc1ace1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,7 +21,7 @@ concurrency: env: go_version: '~1.21.8' - tmpnet_data_path: ~/.tmpnet/networks + grafana_url: https://grafana-experimental.avax-dev.network/d/kBQpRdWnk/avalanche-main-dashboard?orgId=1&refresh=10s&var-filter=is_ephemeral_node%7C%3D%7Cfalse&var-filter=gh_repo%7C%3D%7Cava-labs%2Favalanchego&var-filter=gh_run_id%7C%3D%7C${{ github.run_id }}&var-filter=gh_run_attempt%7C%3D%7C${{ github.run_attempt }} jobs: Unit: @@ -67,15 +67,44 @@ jobs: - name: Build AvalancheGo Binary shell: bash run: ./scripts/build.sh -r + - name: Start prometheus + shell: bash + run: bash -x ./scripts/run_prometheus.sh + env: + PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }} + PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }} + - name: Start promtail + shell: bash + run: bash -x ./scripts/run_promtail.sh + env: + LOKI_ID: ${{ secrets.LOKI_ID }} + LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }} + - name: Notify of metrics availability + shell: bash + run: .github/workflows/notify-metrics-availability.sh + env: + GRAFANA_URL: ${{ env.grafana_url }} + GH_JOB_ID: ${{ github.job }} + FILTER_BY_OWNER: avalanchego-e2e - name: Run e2e tests shell: bash run: E2E_SERIAL=1 ./scripts/tests.e2e.sh + env: + GH_REPO: ${{ github.repository }} + GH_WORKFLOW: ${{ github.workflow }} + GH_RUN_ID: ${{ github.run_id }} + GH_RUN_NUMBER: ${{ github.run_number }} + GH_RUN_ATTEMPT: ${{ github.run_attempt }} + GH_JOB_ID: ${{ github.job }} - name: Upload tmpnet network dir uses: actions/upload-artifact@v4 if: always() with: name: e2e-tmpnet-data - path: ${{ env.tmpnet_data_path }} + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log if-no-files-found: error e2e_existing_network: runs-on: ubuntu-latest @@ -88,15 +117,43 @@ jobs: - name: Build AvalancheGo Binary shell: bash run: ./scripts/build.sh -r 
+ - name: Start prometheus + shell: bash + run: bash -x ./scripts/run_prometheus.sh + env: + PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }} + PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }} + - name: Start promtail + shell: bash + run: bash -x ./scripts/run_promtail.sh + env: + LOKI_ID: ${{ secrets.LOKI_ID }} + LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }} + - name: Notify of metrics availability + shell: bash + run: .github/workflows/notify-metrics-availability.sh + env: + GRAFANA_URL: ${{ env.grafana_url }} + GH_JOB_ID: ${{ github.job }} - name: Run e2e tests with existing network shell: bash run: E2E_SERIAL=1 ./scripts/tests.e2e.existing.sh + env: + GH_REPO: ${{ github.repository }} + GH_WORKFLOW: ${{ github.workflow }} + GH_RUN_ID: ${{ github.run_id }} + GH_RUN_NUMBER: ${{ github.run_number }} + GH_RUN_ATTEMPT: ${{ github.run_attempt }} + GH_JOB_ID: ${{ github.job }} - name: Upload tmpnet network dir uses: actions/upload-artifact@v4 if: always() with: name: e2e-existing-network-tmpnet-data - path: ${{ env.tmpnet_data_path }} + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log if-no-files-found: error Upgrade: runs-on: ubuntu-latest @@ -109,15 +166,43 @@ jobs: - name: Build AvalancheGo Binary shell: bash run: ./scripts/build.sh + - name: Start prometheus + shell: bash + run: bash -x ./scripts/run_prometheus.sh + env: + PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }} + PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }} + - name: Start promtail + shell: bash + run: bash -x ./scripts/run_promtail.sh + env: + LOKI_ID: ${{ secrets.LOKI_ID }} + LOKI_PASSWORD: ${{ secrets.LOKI_PASSWORD }} + - name: Notify of metrics availability + shell: bash + run: .github/workflows/notify-metrics-availability.sh + env: + GRAFANA_URL: ${{ env.grafana_url }} + GH_JOB_ID: ${{ github.job }} - name: Run e2e tests shell: bash run: ./scripts/tests.upgrade.sh + env: + GH_REPO: ${{ github.repository }} + GH_WORKFLOW: ${{ github.workflow 
}} + GH_RUN_ID: ${{ github.run_id }} + GH_RUN_NUMBER: ${{ github.run_number }} + GH_RUN_ATTEMPT: ${{ github.run_attempt }} + GH_JOB_ID: ${{ github.job }} - name: Upload tmpnet network dir uses: actions/upload-artifact@v4 if: always() with: name: upgrade-tmpnet-data - path: ${{ env.tmpnet_data_path }} + path: | + ~/.tmpnet/networks + ~/.tmpnet/prometheus/prometheus.log + ~/.tmpnet/promtail/promtail.log if-no-files-found: error Lint: runs-on: ubuntu-latest diff --git a/.github/workflows/notify-metrics-availability.sh b/.github/workflows/notify-metrics-availability.sh new file mode 100755 index 00000000000..fd69064045c --- /dev/null +++ b/.github/workflows/notify-metrics-availability.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Timestamps are in seconds +from_timestamp="$(date '+%s')" +monitoring_period=900 # 15 minutes +to_timestamp="$((from_timestamp + monitoring_period))" + +# Grafana expects milliseconds, so pad timestamps with 3 zeros +metrics_url="${GRAFANA_URL}&var-filter=gh_job_id%7C%3D%7C${GH_JOB_ID}&from=${from_timestamp}000&to=${to_timestamp}000" + +# Optionally ensure that the link displays metrics only for the shared +# network rather than mixing it with the results for private networks. +if [[ -n "${FILTER_BY_OWNER:-}" ]]; then + metrics_url="${metrics_url}&var-filter=network_owner%7C%3D%7C${FILTER_BY_OWNER}" +fi + +echo "::notice links::metrics ${metrics_url}" diff --git a/scripts/run_prometheus.sh b/scripts/run_prometheus.sh new file mode 100755 index 00000000000..19522272310 --- /dev/null +++ b/scripts/run_prometheus.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Starts a prometheus instance in agent-mode, forwarding to a central +# instance. Intended to enable metrics collection from temporary networks running +# locally and in CI. +# +# The prometheus instance will remain running in the background and will forward +# metrics to the central instance for all tmpnet networks. 
+# +# To stop it: +# +# $ kill -9 `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid +# + +# e.g., +# PROMETHEUS_ID= PROMETHEUS_PASSWORD= ./scripts/run_prometheus.sh +if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then + echo "must be run from repository root" + exit 255 +fi + +PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus" +PIDFILE="${PROMETHEUS_WORKING_DIR}"/run.pid + +# First check if an agent-mode prometheus is already running. A single instance can collect +# metrics from all local temporary networks. +if pgrep --pidfile="${PIDFILE}" -f 'prometheus.*enable-feature=agent' &> /dev/null; then + echo "prometheus is already running locally with --enable-feature=agent" + exit 0 +fi + +PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-experimental.avax-dev.network}" +if [[ -z "${PROMETHEUS_URL}" ]]; then + echo "Please provide a value for PROMETHEUS_URL" + exit 1 +fi + +PROMETHEUS_ID="${PROMETHEUS_ID:-}" +if [[ -z "${PROMETHEUS_ID}" ]]; then + echo "Please provide a value for PROMETHEUS_ID" + exit 1 +fi + +PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}" +if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then + echo "Plase provide a value for PROMETHEUS_PASSWORD" + exit 1 +fi + +# This was the LTS version when this script was written. Probably not +# much reason to update it unless something breaks since the usage +# here is only to collect metrics from temporary networks. +VERSION="2.45.3" + +# Ensure the prometheus command is locally available +CMD=prometheus +if ! command -v "${CMD}" &> /dev/null; then + # Try to use a local version + CMD="${PWD}/bin/prometheus" + if ! command -v "${CMD}" &> /dev/null; then + echo "prometheus not found, attempting to install..." + + # Determine the arch + if which sw_vers &> /dev/null; then + echo "on macos, only amd64 binaries are available so rosetta is required on apple silicon machines." 
+ echo "to avoid using rosetta, install via homebrew: brew install prometheus" + DIST=darwin + else + ARCH="$(uname -i)" + if [[ "${ARCH}" != "x86_64" ]]; then + echo "on linux, only amd64 binaries are available. manual installation of prometheus is required." + exit 1 + else + DIST="linux" + fi + fi + + # Install the specified release + PROMETHEUS_FILE="prometheus-${VERSION}.${DIST}-amd64" + URL="https://github.com/prometheus/prometheus/releases/download/v${VERSION}/${PROMETHEUS_FILE}.tar.gz" + curl -s -L "${URL}" | tar zxv -C /tmp > /dev/null + mkdir -p "$(dirname "${CMD}")" + cp /tmp/"${PROMETHEUS_FILE}/prometheus" "${CMD}" + fi +fi + +# Configure prometheus +FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs" +mkdir -p "${FILE_SD_PATH}" + +echo "writing configuration..." +cat >"${PROMETHEUS_WORKING_DIR}"/prometheus.yaml < prometheus.log 2>&1 & +echo $! > "${PIDFILE}" +echo "running with pid $(cat "${PIDFILE}")" diff --git a/scripts/run_promtail.sh b/scripts/run_promtail.sh new file mode 100755 index 00000000000..9b386d3d55f --- /dev/null +++ b/scripts/run_promtail.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Starts a promtail instance to collect logs from temporary networks +# running locally and in CI. +# +# The promtail instance will remain running in the background and will forward +# logs to the central instance for all tmpnet networks. +# +# To stop it: +# +# $ kill -9 `cat ~/.tmpnet/promtail/run.pid` && rm ~/.tmpnet/promtail/run.pid +# + +# e.g., +# LOKI_ID= LOKI_PASSWORD= ./scripts/run_promtail.sh +if ! [[ "$0" =~ scripts/run_promtail.sh ]]; then + echo "must be run from repository root" + exit 255 +fi + +PROMTAIL_WORKING_DIR="${HOME}/.tmpnet/promtail" +PIDFILE="${PROMTAIL_WORKING_DIR}"/run.pid + +# First check if promtail is already running. A single instance can +# collect logs from all local temporary networks. 
+if pgrep --pidfile="${PIDFILE}" &> /dev/null; then + echo "promtail is already running" + exit 0 +fi + +LOKI_URL="${LOKI_URL:-https://loki-experimental.avax-dev.network}" +if [[ -z "${LOKI_URL}" ]]; then + echo "Please provide a value for LOKI_URL" + exit 1 +fi + +LOKI_ID="${LOKI_ID:-}" +if [[ -z "${LOKI_ID}" ]]; then + echo "Please provide a value for LOKI_ID" + exit 1 +fi + +LOKI_PASSWORD="${LOKI_PASSWORD:-}" +if [[ -z "${LOKI_PASSWORD}" ]]; then + echo "Plase provide a value for LOKI_PASSWORD" + exit 1 +fi + +# Version as of this writing +VERSION="v2.9.5" + +# Ensure the promtail command is locally available +CMD=promtail +if ! command -v "${CMD}" &> /dev/null; then + # Try to use a local version + CMD="${PWD}/bin/promtail" + if ! command -v "${CMD}" &> /dev/null; then + echo "promtail not found, attempting to install..." + # Determine the arch + if which sw_vers &> /dev/null; then + DIST="darwin-$(uname -m)" + else + ARCH="$(uname -i)" + if [[ "${ARCH}" == "aarch64" ]]; then + ARCH="arm64" + elif [[ "${ARCH}" == "x86_64" ]]; then + ARCH="amd64" + fi + DIST="linux-${ARCH}" + fi + + # Install the specified release + PROMTAIL_FILE="promtail-${DIST}" + ZIP_PATH="/tmp/${PROMTAIL_FILE}.zip" + BIN_DIR="$(dirname "${CMD}")" + URL="https://github.com/grafana/loki/releases/download/${VERSION}/promtail-${DIST}.zip" + curl -L -o "${ZIP_PATH}" "${URL}" + unzip "${ZIP_PATH}" -d "${BIN_DIR}" + mv "${BIN_DIR}/${PROMTAIL_FILE}" "${CMD}" + fi +fi + +# Configure promtail +FILE_SD_PATH="${PROMTAIL_WORKING_DIR}/file_sd_configs" +mkdir -p "${FILE_SD_PATH}" + +echo "writing configuration..." +cat >"${PROMTAIL_WORKING_DIR}"/promtail.yaml < promtail.log 2>&1 & +echo $! 
> "${PIDFILE}" +echo "running with pid $(cat "${PIDFILE}")" diff --git a/tests/fixture/e2e/env.go b/tests/fixture/e2e/env.go index 036e5218f62..d445f4de24a 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/e2e/env.go @@ -40,6 +40,10 @@ type TestEnvironment struct { URIs []tmpnet.NodeURI // The URI used to access the http server that allocates test data TestDataServerURI string + // The duration to wait before shutting down private networks. A + // non-zero value may be useful to ensure all metrics can be + // scraped before shutdown. + PrivateNetworkShutdownDelay time.Duration require *require.Assertions } @@ -74,7 +78,7 @@ func NewTestEnvironment(flagVars *FlagVars, desiredNetwork *tmpnet.Network) *Tes } } else { network = desiredNetwork - StartNetwork(network, flagVars.AvalancheGoExecPath(), flagVars.PluginDir()) + StartNetwork(network, flagVars.AvalancheGoExecPath(), flagVars.PluginDir(), flagVars.NetworkShutdownDelay()) } // A new network will always need subnet creation and an existing @@ -112,10 +116,11 @@ func NewTestEnvironment(flagVars *FlagVars, desiredNetwork *tmpnet.Network) *Tes require.NoError(err) return &TestEnvironment{ - NetworkDir: network.Dir, - URIs: uris, - TestDataServerURI: testDataServerURI, - require: require, + NetworkDir: network.Dir, + URIs: uris, + TestDataServerURI: testDataServerURI, + PrivateNetworkShutdownDelay: flagVars.NetworkShutdownDelay(), + require: require, } } @@ -167,5 +172,6 @@ func (te *TestEnvironment) StartPrivateNetwork(network *tmpnet.Network) { network, sharedNetwork.DefaultRuntimeConfig.AvalancheGoPath, pluginDir, + te.PrivateNetworkShutdownDelay, ) } diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/e2e/flags.go index 2a00df97a88..4a0e12add4d 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/e2e/flags.go @@ -7,15 +7,17 @@ import ( "flag" "fmt" "os" + "time" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" ) type FlagVars struct { - avalancheGoExecPath string - pluginDir string - 
networkDir string - useExistingNetwork bool + avalancheGoExecPath string + pluginDir string + networkDir string + useExistingNetwork bool + networkShutdownDelay time.Duration } func (v *FlagVars) AvalancheGoExecPath() string { @@ -40,6 +42,10 @@ func (v *FlagVars) UseExistingNetwork() bool { return v.useExistingNetwork } +func (v *FlagVars) NetworkShutdownDelay() time.Duration { + return v.networkShutdownDelay +} + func RegisterFlags() *FlagVars { vars := FlagVars{} flag.StringVar( @@ -66,6 +72,12 @@ func RegisterFlags() *FlagVars { false, "[optional] whether to target the existing network identified by --network-dir.", ) + flag.DurationVar( + &vars.networkShutdownDelay, + "network-shutdown-delay", + 12*time.Second, // Make sure this value takes into account the scrape_interval defined in scripts/run_prometheus.sh + "[optional] the duration to wait before shutting down the test network at the end of the test run. A value greater than the scrape interval is suggested. 0 avoids waiting for shutdown.", + ) return &vars } diff --git a/tests/fixture/e2e/helpers.go b/tests/fixture/e2e/helpers.go index c2382cee9c7..0adffd60c7f 100644 --- a/tests/fixture/e2e/helpers.go +++ b/tests/fixture/e2e/helpers.go @@ -216,7 +216,7 @@ func CheckBootstrapIsPossible(network *tmpnet.Network) { } // Start a temporary network with the provided avalanchego binary. 
-func StartNetwork(network *tmpnet.Network, avalancheGoExecPath string, pluginDir string) { +func StartNetwork(network *tmpnet.Network, avalancheGoExecPath string, pluginDir string, shutdownDelay time.Duration) { require := require.New(ginkgo.GinkgoT()) require.NoError( @@ -232,6 +232,11 @@ func StartNetwork(network *tmpnet.Network, avalancheGoExecPath string, pluginDir ) ginkgo.DeferCleanup(func() { + if shutdownDelay > 0 { + tests.Outf("Waiting %s before network shutdown to ensure final metrics scrape\n", shutdownDelay) + time.Sleep(shutdownDelay) + } + tests.Outf("Shutting down network\n") ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) defer cancel() diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index bfeebf93b98..9c678c35d96 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -128,6 +128,10 @@ A temporary network relies on configuration written to disk in the following str ``` HOME └── .tmpnet // Root path for the temporary network fixture + ├── prometheus // Working directory for a metrics-scraping prometheus instance + │ └── file_sd_configs // Directory containing file-based service discovery config for prometheus + ├── promtail // Working directory for a log-collecting promtail instance + │ └── file_sd_configs // Directory containing file-based service discovery config for promtail └── networks // Default parent directory for temporary networks └── 20240306-152305.924531 // The timestamp of creation is the name of a network's directory ├── NodeID-37E8UK3x2YFsHE3RdALmfWcppcZ1eTuj9 // The ID of a node is the name of its data dir @@ -229,3 +233,44 @@ The process details of a node are written by avalanchego to `[base-data-dir]/process.json`. The file contains the PID of the node process, the URI of the node's API, and the address other nodes can use to bootstrap themselves (aka staking address). 
+ +## Metrics + +### Prometheus configuration + +When nodes are started, prometheus configuration for each node is +written to `~/.tmpnet/prometheus/file_sd_configs/` with a filename of +`[network uuid]_[node id].json`. Prometheus can be configured to +scrape the nodes as per the following example: + +```yaml +scrape_configs: + - job_name: "avalanchego" + metrics_path: "/ext/metrics" + file_sd_configs: + - files: + - '/home/me/.tmpnet/prometheus/file_sd_configs/*.json' +``` + +### Viewing metrics + +When a network is started with `tmpnet`, a grafana link for the +network's metrics will be emitted. + +The metrics emitted by temporary networks configured with tmpnet will +have the following labels applied: + + - `network_uuid` + - `node_id` + - `is_ephemeral_node` + - `network_owner` + +When a tmpnet network runs as part of github CI, the following +additional labels will be applied: + + - `gh_repo` + - `gh_workflow` + - `gh_run_id` + - `gh_run_number` + - `gh_run_attempt` + - `gh_job_id` diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 954162c3fcf..ceff7bcff98 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -71,7 +71,9 @@ type Network struct { // unique network ID values across all temporary networks. UUID string - // A string identifying the entity that started or maintains this network. + // A string identifying the entity that started or maintains this + // network. Useful for differentiating between networks when a + // given CI job uses multiple networks. 
Owner string // Path where network configuration and data is stored @@ -230,7 +232,7 @@ func (n *Network) Create(rootDir string) error { if len(rootDir) == 0 { // Use the default root dir var err error - rootDir, err = getDefaultRootDir() + rootDir, err = getDefaultRootNetworkDir() if err != nil { return err } @@ -305,6 +307,9 @@ func (n *Network) Start(ctx context.Context, w io.Writer) error { return err } + // Record the time before nodes are started to ensure visibility of subsequently collected metrics via the emitted link + startTime := time.Now() + // Configure the networking for each node and start for _, node := range n.Nodes { if err := n.StartNode(ctx, w, node); err != nil { @@ -321,6 +326,10 @@ func (n *Network) Start(ctx context.Context, w io.Writer) error { if _, err := fmt.Fprintf(w, "\nStarted network %s (UUID: %s)\n", n.Dir, n.UUID); err != nil { return err } + // Provide a link to the main dashboard filtered by the uuid and showing results from now till whenever the link is viewed + if _, err := fmt.Fprintf(w, "\nMetrics: https://grafana-experimental.avax-dev.network/d/kBQpRdWnk/avalanche-main-dashboard?&var-filter=network_uuid%%7C%%3D%%7C%s&var-filter=is_ephemeral_node%%7C%%3D%%7Cfalse&from=%d&to=now\n", n.UUID, startTime.UnixMilli()); err != nil { + return err + } return nil } @@ -455,6 +464,12 @@ func (n *Network) Restart(ctx context.Context, w io.Writer) error { func (n *Network) EnsureNodeConfig(node *Node) error { flags := node.Flags + // Ensure nodes can label their metrics with the network uuid + node.NetworkUUID = n.UUID + + // Ensure nodes can label metrics with an indication of the shared/private nature of the network + node.NetworkOwner = n.Owner + // Set the network name if available if n.Genesis != nil && n.Genesis.NetworkID > 0 { // Convert the network id to a string to ensure consistency in JSON round-tripping. 
@@ -672,12 +687,21 @@ func (n *Network) getBootstrapIPsAndIDs(skippedNode *Node) ([]string, []string, return bootstrapIPs, bootstrapIDs, nil } +// Retrieves the root dir for tmpnet data. +func getTmpnetPath() (string, error) { + homeDir, err := os.UserHomeDir() + if err != nil { + return "", err + } + return filepath.Join(homeDir, ".tmpnet"), nil +} + // Retrieves the default root dir for storing networks and their // configuration. -func getDefaultRootDir() (string, error) { - homeDir, err := os.UserHomeDir() +func getDefaultRootNetworkDir() (string, error) { + tmpnetPath, err := getTmpnetPath() if err != nil { return "", err } - return filepath.Join(homeDir, ".tmpnet", "networks"), nil + return filepath.Join(tmpnetPath, "networks"), nil } diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index 99c0aeb3ebc..10f80371cbf 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -53,6 +53,16 @@ type NodeRuntimeConfig struct { // Node supports configuring and running a node participating in a temporary network. type Node struct { + // Uniquely identifies the network the node is part of to enable monitoring. + NetworkUUID string + + // Identify the entity associated with this network. This is + // intended to be used to label metrics to enable filtering + // results for a test run between the primary/shared network used + // by the majority of tests and private networks used by + // individual tests. + NetworkOwner string + // Set by EnsureNodeID which is also called when the node is read. 
NodeID ids.NodeID diff --git a/tests/fixture/tmpnet/node_config.go b/tests/fixture/tmpnet/node_config.go index 3ebbc01b6c3..1f47c926dfc 100644 --- a/tests/fixture/tmpnet/node_config.go +++ b/tests/fixture/tmpnet/node_config.go @@ -61,12 +61,16 @@ func (n *Node) readConfig() error { } type serializedNodeConfig struct { + NetworkUUID string + NetworkOwner string IsEphemeral bool RuntimeConfig *NodeRuntimeConfig } func (n *Node) writeConfig() error { config := serializedNodeConfig{ + NetworkUUID: n.NetworkUUID, + NetworkOwner: n.NetworkOwner, IsEphemeral: n.IsEphemeral, RuntimeConfig: n.RuntimeConfig, } diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go index c2e2e33139b..dc5c5bfebf1 100644 --- a/tests/fixture/tmpnet/node_process.go +++ b/tests/fixture/tmpnet/node_process.go @@ -14,12 +14,15 @@ import ( "os" "os/exec" "path/filepath" + "strconv" + "strings" "syscall" "time" "github.com/ava-labs/avalanchego/api/health" "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/node" + "github.com/ava-labs/avalanchego/utils/perms" ) const ( @@ -142,8 +145,12 @@ func (p *NodeProcess) Start(w io.Writer) error { return fmt.Errorf("failed to start local node: %w", err) } - _, err = fmt.Fprintf(w, "Started %s\n", nodeDescription) - return err + if _, err = fmt.Fprintf(w, "Started %s\n", nodeDescription); err != nil { + return err + } + + // Configure collection of metrics and logs + return p.writeMonitoringConfig() } // Signals the node process to stop. 
@@ -154,7 +161,7 @@ func (p *NodeProcess) InitiateStop() error { } if proc == nil { // Already stopped - return nil + return p.removeMonitoringConfig() } if err := proc.Signal(syscall.SIGTERM); err != nil { return fmt.Errorf("failed to send SIGTERM to pid %d: %w", p.pid, err) @@ -172,7 +179,7 @@ func (p *NodeProcess) WaitForStopped(ctx context.Context) error { return fmt.Errorf("failed to retrieve process: %w", err) } if proc == nil { - return nil + return p.removeMonitoringConfig() } select { @@ -256,3 +263,95 @@ func (p *NodeProcess) getProcess() (*os.Process, error) { } return nil, fmt.Errorf("failed to determine process status: %w", err) } + +// Write monitoring configuration enabling collection of metrics and logs from the node. +func (p *NodeProcess) writeMonitoringConfig() error { + // Ensure labeling that uniquely identifies the node and its network + commonLabels := FlagsMap{ + "network_uuid": p.node.NetworkUUID, + "node_id": p.node.NodeID, + "is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral), + "network_owner": p.node.NetworkOwner, + // prometheus/promtail ignore empty values so including these + // labels with empty values outside of a github worker (where + // the env vars will not be set) should not be a problem. 
+ "gh_repo": os.Getenv("GH_REPO"), + "gh_workflow": os.Getenv("GH_WORKFLOW"), + "gh_run_id": os.Getenv("GH_RUN_ID"), + "gh_run_number": os.Getenv("GH_RUN_NUMBER"), + "gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"), + "gh_job_id": os.Getenv("GH_JOB_ID"), + } + + tmpnetDir, err := getTmpnetPath() + if err != nil { + return err + } + + prometheusConfig := []FlagsMap{ + { + "targets": []string{strings.TrimPrefix(p.node.URI, "http://")}, + "labels": commonLabels, + }, + } + if err := p.writeMonitoringConfigFile(tmpnetDir, "prometheus", prometheusConfig); err != nil { + return err + } + + promtailLabels := FlagsMap{ + "__path__": filepath.Join(p.node.getDataDir(), "logs", "*.log"), + } + promtailLabels.SetDefaults(commonLabels) + promtailConfig := []FlagsMap{ + { + "targets": []string{"localhost"}, + "labels": promtailLabels, + }, + } + return p.writeMonitoringConfigFile(tmpnetDir, "promtail", promtailConfig) +} + +// Return the path of this node's service discovery configuration file for the named monitoring tool (e.g. prometheus, promtail). +func (p *NodeProcess) getMonitoringConfigPath(tmpnetDir string, name string) string { + // Ensure a unique filename to allow config files to be added and removed + // by multiple nodes without conflict. + return filepath.Join(tmpnetDir, name, "file_sd_configs", fmt.Sprintf("%s_%s.json", p.node.NetworkUUID, p.node.NodeID)) +} + +// Ensure the removal of the prometheus and promtail configuration files for this node. +func (p *NodeProcess) removeMonitoringConfig() error { + tmpnetDir, err := getTmpnetPath() + if err != nil { + return err + } + + for _, name := range []string{"promtail", "prometheus"} { + configPath := p.getMonitoringConfigPath(tmpnetDir, name) + if err := os.Remove(configPath); err != nil && !errors.Is(err, fs.ErrNotExist) { + return fmt.Errorf("failed to remove %s config: %w", name, err) + } + } + + return nil +} + +// Write the configuration for a type of monitoring (e.g. prometheus, promtail). 
+func (p *NodeProcess) writeMonitoringConfigFile(tmpnetDir string, name string, config []FlagsMap) error { + configPath := p.getMonitoringConfigPath(tmpnetDir, name) + + dir := filepath.Dir(configPath) + if err := os.MkdirAll(dir, perms.ReadWriteExecute); err != nil { + return fmt.Errorf("failed to create %s service discovery dir: %w", name, err) + } + + bytes, err := DefaultJSONMarshal(config) + if err != nil { + return fmt.Errorf("failed to marshal %s config: %w", name, err) + } + + if err := os.WriteFile(configPath, bytes, perms.ReadWrite); err != nil { + return fmt.Errorf("failed to write %s config: %w", name, err) + } + + return nil +} diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index ed4bfd0fa13..3e4509fa2c9 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -48,7 +48,7 @@ var _ = ginkgo.Describe("[Upgrade]", func() { network := &tmpnet.Network{ Owner: "avalanchego-upgrade", } - e2e.StartNetwork(network, avalancheGoExecPath, "" /* pluginDir */) + e2e.StartNetwork(network, avalancheGoExecPath, "" /* pluginDir */, 0 /* shutdownDelay */) ginkgo.By(fmt.Sprintf("restarting all nodes with %q binary", avalancheGoExecPathToUpgradeTo)) for _, node := range network.Nodes {