Skip to content

Commit

Permalink
tmpnet: Write config enabling metrics collection by prometheus
Browse files Browse the repository at this point in the history
Temporary networks used for testing previously lacked an easy way to
enable metrics collection. This PR ensures that prometheus has what it
needs to scrape the metrics endpoints of a temporary network and
enables scraping of CI jobs using temporary networks.

- Write prometheus configuration to ~/.tmpnet/prometheus/file_sd_configs for
  each node on startup and remove it on shutdown. This enables scraping
  of all nodes in a network no matter when they are started.

  Ref: https://prometheus.io/docs/guides/file-sd/

- Add script to scrape temporary networks with agent-mode
  prometheus. Works locally and in CI
  • Loading branch information
marun committed Mar 7, 2024
1 parent d3c4322 commit bdaf176
Show file tree
Hide file tree
Showing 11 changed files with 259 additions and 17 deletions.
11 changes: 11 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,17 @@ jobs:
- name: Build AvalancheGo Binary
shell: bash
run: ./scripts/build.sh -r
# Start an agent-mode prometheus in the background so that metrics from the
# temporary networks started by subsequent steps are scraped and forwarded.
# The GH_* values are passed through so metrics can be labeled by CI run.
- name: Start prometheus
  shell: bash
  run: ./scripts/run_prometheus.sh
  env:
    PROMETHEUS_ID: ${{ secrets.PROMETHEUS_ID }}
    PROMETHEUS_PASSWORD: ${{ secrets.PROMETHEUS_PASSWORD }}
    GH_REPO: ${{ github.repository }}
    GH_JOB_ID: ${{ github.job }}
    GH_RUN_ID: ${{ github.run_id }}
    # run_number is the per-workflow counter; it is distinct from run_id.
    GH_RUN_NUMBER: ${{ github.run_number }}
    GH_RUN_ATTEMPT: ${{ github.run_attempt }}
- name: Run e2e tests
shell: bash
run: E2E_SERIAL=1 ./scripts/tests.e2e.sh
Expand Down
131 changes: 131 additions & 0 deletions scripts/run_prometheus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env bash

set -euo pipefail

# Starts a prometheus instance in agent-mode, forwarding to a central
# instance. Intended to enable metrics collection from temporary networks running
# locally and in CI.
#
# The prometheus instance will remain running in the background and will forward
# metrics to the central instance for all tmpnet networks.
#
# To stop it:
#
#   $ kill -9 `cat ~/.tmpnet/prometheus/run.pid` && rm ~/.tmpnet/prometheus/run.pid
#
# Required environment:
#   PROMETHEUS_ID        basic-auth username for the central instance
#   PROMETHEUS_PASSWORD  basic-auth password for the central instance
# Optional environment:
#   PROMETHEUS_URL       base URL of the central instance (a default is provided)
#   GH_*                 github metadata used to label metrics collected in CI

# e.g.,
# PROMETHEUS_ID=<id> PROMETHEUS_PASSWORD=<password> ./scripts/run_prometheus.sh
# GH_{run,job_id,etc}=<value> PROMETHEUS_ID=<id> PROMETHEUS_PASSWORD=<password> ./scripts/run_prometheus.sh # Configure with github labels
if ! [[ "$0" =~ scripts/run_prometheus.sh ]]; then
  echo "must be run from repository root"
  exit 255
fi

# First check if an agent-mode prometheus is already running. One
# instance can collect metrics from all the local temporary networks.
if ps aux | grep prometheus | grep enable-feature=agent &> /dev/null; then
  echo "prometheus is already running locally with --enable-feature=agent"
  exit 0
fi

# The default must not contain quote characters of its own: the value is
# interpolated inside a quoted YAML string (url: "${PROMETHEUS_URL}/api/v1/write"),
# so embedded quotes would corrupt the remote-write URL.
PROMETHEUS_URL="${PROMETHEUS_URL:-https://prometheus-experimental.avax-dev.network}"
if [[ -z "${PROMETHEUS_URL}" ]]; then
  echo "Please provide a value for PROMETHEUS_URL"
  exit 1
fi

# The `:-` expansions below keep `set -u` from aborting on an unset variable
# so that a helpful message can be printed instead.
PROMETHEUS_ID="${PROMETHEUS_ID:-}"
if [[ -z "${PROMETHEUS_ID}" ]]; then
  echo "Please provide a value for PROMETHEUS_ID"
  exit 1
fi

PROMETHEUS_PASSWORD="${PROMETHEUS_PASSWORD:-}"
if [[ -z "${PROMETHEUS_PASSWORD}" ]]; then
  echo "Please provide a value for PROMETHEUS_PASSWORD"
  exit 1
fi

# This was the LTS version when this script was written. Probably not
# much reason to update it unless something breaks since the usage
# here is only to collect metrics from temporary networks.
VERSION="2.45.3"

# Ensure the prometheus command is locally available. Preference order:
# a prometheus on the PATH, then ./bin/prometheus, then a fresh download of
# the pinned release into ./bin.
CMD=prometheus
if ! command -v "${CMD}" &> /dev/null; then
  # Try to use a local version
  CMD=./bin/prometheus
  if ! command -v "${CMD}" &> /dev/null; then
    echo "prometheus not found, attempting to install..."

    # Determine the arch
    if command -v sw_vers &> /dev/null; then
      echo "on macos, only amd64 binaries are available so rosetta is required on apple silicon machines."
      echo "to avoid using rosetta, install via homebrew: brew install prometheus"
      DIST=darwin
    else
      # `uname -m` reports the machine hardware name, which is spelled
      # `x86_64` (underscore) on amd64 linux. `uname -i` is avoided because
      # it is non-portable and can report `unknown`.
      ARCH="$(uname -m)"
      if [[ "${ARCH}" != "x86_64" ]]; then
        echo "on linux, only amd64 binaries are available. manual installation of prometheus is required."
        exit 1
      fi
      DIST="linux"
    fi

    # Install the specified release
    PROMETHEUS_FILE="prometheus-${VERSION}.${DIST}-amd64"
    URL="https://github.com/prometheus/prometheus/releases/download/v${VERSION}/${PROMETHEUS_FILE}.tar.gz"
    curl -s -L "${URL}" | tar zxv -C /tmp > /dev/null
    mkdir -p "$(dirname "${CMD}")"
    cp /tmp/"${PROMETHEUS_FILE}/prometheus" "${CMD}"
  fi
fi

# When GH_REPO is set (e.g. when running under github actions), build a YAML
# fragment of labels describing the repo, job and run. The fragment is spliced
# verbatim into the prometheus configuration written further down via
# ${LABELS:-}; when GH_REPO is unset, LABELS stays unset and nothing is added.
# Note that `<<-` strips leading tabs (not spaces) from the heredoc body.
# NOTE(review): confirm that the fragment's indentation and shape (a list of
# single-key maps under `labels:`) are what prometheus accepts at the position
# where it is spliced in.
if [[ -n "${GH_REPO:-}" ]]; then
LABELS="$(cat <<-END
labels:
- gh_repo: "${GH_REPO}"
- gh_job_id: "${GH_JOB_ID:-}"
- gh_run_id: "${GH_RUN_ID:-}"
- gh_run_number: "${GH_RUN_NUMBER:-}"
- gh_run_attempt: "${GH_RUN_ATTEMPT:-}"
END
)"
fi

# Configure prometheus
PROMETHEUS_WORKING_DIR="${HOME}/.tmpnet/prometheus"
FILE_SD_PATH="${PROMETHEUS_WORKING_DIR}/file_sd_configs"
mkdir -p "${FILE_SD_PATH}"

# Name of the configuration file, relative to the working dir. Defined once
# so that the file that is written is the same file prometheus is started with.
CONFIG_FILE="prometheus.yaml"

echo "writing configuration..."
cat >"${PROMETHEUS_WORKING_DIR}/${CONFIG_FILE}" <<EOL
# my global config
global:
  scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every 1 minute.
  evaluation_interval: 10s # Evaluate rules every 10 seconds. The default is every 1 minute.
  scrape_timeout: 5s # The default is 10s

scrape_configs:
  - job_name: "avalanchego"
    metrics_path: "/ext/metrics"
    file_sd_configs:
      - files:
          - '${FILE_SD_PATH}/*.yaml'
${LABELS:-}
remote_write:
  - url: "${PROMETHEUS_URL}/api/v1/write"
    basic_auth:
      username: "${PROMETHEUS_ID}"
      password: "${PROMETHEUS_PASSWORD}"
EOL
# The configuration embeds the basic-auth password, so restrict it to the owner.
chmod 600 "${PROMETHEUS_WORKING_DIR}/${CONFIG_FILE}"

# A command given as a relative path (e.g. ./bin/prometheus) is relative to
# the repository root. Make it absolute so it still resolves after the
# change of directory below. A bare command name is left for PATH lookup.
if [[ "${CMD}" == */* && "${CMD}" != /* ]]; then
  CMD="${PWD}/${CMD}"
fi

echo "starting prometheus..."
# Run from the working dir so that relative paths used by prometheus
# (including the config file argument) resolve there.
cd "${PROMETHEUS_WORKING_DIR}"
nohup "${CMD}" --config.file="${CONFIG_FILE}" --web.listen-address=localhost:0 --enable-feature=agent &> /dev/null &
# Record the pid of the background process so that it can be stopped later.
echo $! > "${PROMETHEUS_WORKING_DIR}/run.pid"
16 changes: 11 additions & 5 deletions tests/fixture/e2e/env.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ type TestEnvironment struct {
URIs []tmpnet.NodeURI
// The URI used to access the http server that allocates test data
TestDataServerURI string
// The number of seconds to wait before shutting down private
// networks. A non-zero value may be useful to ensure all metrics
// can be scraped before shutdown.
PrivateNetworkShutdownDelay uint

require *require.Assertions
}
Expand Down Expand Up @@ -74,7 +78,7 @@ func NewTestEnvironment(flagVars *FlagVars, networkDirSuffix string, desiredNetw
}
} else {
network = desiredNetwork
StartNetwork(network, networkDirSuffix, flagVars.AvalancheGoExecPath(), flagVars.PluginDir())
StartNetwork(network, networkDirSuffix, flagVars.AvalancheGoExecPath(), flagVars.PluginDir(), flagVars.NetworkShutdownDelay())
}

// A new network will always need subnet creation and an existing
Expand Down Expand Up @@ -112,10 +116,11 @@ func NewTestEnvironment(flagVars *FlagVars, networkDirSuffix string, desiredNetw
require.NoError(err)

return &TestEnvironment{
NetworkDir: network.Dir,
URIs: uris,
TestDataServerURI: testDataServerURI,
require: require,
NetworkDir: network.Dir,
URIs: uris,
TestDataServerURI: testDataServerURI,
PrivateNetworkShutdownDelay: flagVars.NetworkShutdownDelay(),
require: require,
}
}

Expand Down Expand Up @@ -169,6 +174,7 @@ func (te *TestEnvironment) NewPrivateNetwork(networkDirSuffix string) *tmpnet.Ne
networkDirSuffix,
sharedNetwork.DefaultRuntimeConfig.AvalancheGoPath,
pluginDir,
te.PrivateNetworkShutdownDelay,
)

return network
Expand Down
19 changes: 15 additions & 4 deletions tests/fixture/e2e/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ import (
)

type FlagVars struct {
avalancheGoExecPath string
pluginDir string
networkDir string
useExistingNetwork bool
avalancheGoExecPath string
pluginDir string
networkDir string
useExistingNetwork bool
networkShutdownDelay uint
}

func (v *FlagVars) AvalancheGoExecPath() string {
Expand All @@ -40,6 +41,10 @@ func (v *FlagVars) UseExistingNetwork() bool {
return v.useExistingNetwork
}

// NetworkShutdownDelay returns the number of seconds to wait before
// shutting down the test network, as configured by the
// --network-shutdown-delay flag.
func (v *FlagVars) NetworkShutdownDelay() uint {
return v.networkShutdownDelay
}

func RegisterFlags() *FlagVars {
vars := FlagVars{}
flag.StringVar(
Expand All @@ -66,6 +71,12 @@ func RegisterFlags() *FlagVars {
false,
"[optional] whether to target the existing network identified by --network-dir.",
)
flag.UintVar(
&vars.networkShutdownDelay,
"network-shutdown-delay",
0,
"[optional] the number of seconds to wait before shutting down the test network at the end of the test run. If collecting metrics, a value greater than the scrape interval is suggested.",
)

return &vars
}
7 changes: 6 additions & 1 deletion tests/fixture/e2e/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ func CheckBootstrapIsPossible(network *tmpnet.Network) {
}

// Start a temporary network with the provided avalanchego binary.
func StartNetwork(network *tmpnet.Network, networkDirSuffix string, avalancheGoExecPath string, pluginDir string) {
func StartNetwork(network *tmpnet.Network, networkDirSuffix string, avalancheGoExecPath string, pluginDir string, shutdownDelay uint) {
require := require.New(ginkgo.GinkgoT())

require.NoError(
Expand All @@ -233,6 +233,11 @@ func StartNetwork(network *tmpnet.Network, networkDirSuffix string, avalancheGoE
)

ginkgo.DeferCleanup(func() {
if shutdownDelay > 0 {
tests.Outf("Waiting %d seconds before network shutdown to ensure final metrics scrape\n", shutdownDelay)
time.Sleep(time.Duration(shutdownDelay) * time.Second)
}

tests.Outf("Shutting down network\n")
ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout)
defer cancel()
Expand Down
2 changes: 2 additions & 0 deletions tests/fixture/tmpnet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ A temporary network relies on configuration written to disk in the following str
```
HOME
└── .tmpnet // Root path for the temporary network fixture
├── prometheus // Working directory for a metrics-scraping prometheus instance
│ └── file_sd_configs // Directory containing file-based service discovery config for prometheus
└── networks // Default parent directory for temporary networks
└── 20240306-152305.924531 // The timestamp of creation is the name of a network's directory
├── NodeID-37E8UK3x2YFsHE3RdALmfWcppcZ1eTuj9 // The ID of a node is the name of its data dir
Expand Down
17 changes: 15 additions & 2 deletions tests/fixture/tmpnet/network.go
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ func (n *Network) Create(rootDir string, networkDirSuffix string) error {
if len(rootDir) == 0 {
// Use the default root dir
var err error
rootDir, err = getDefaultRootDir()
rootDir, err = getDefaultRootNetworkDir()
if err != nil {
return err
}
Expand Down Expand Up @@ -452,6 +452,9 @@ func (n *Network) Restart(ctx context.Context, w io.Writer) error {
func (n *Network) EnsureNodeConfig(node *Node) error {
flags := node.Flags

// Ensure nodes can include the network uuid in their monitoring configuration
node.NetworkUUID = n.UUID

// Set the network name if available
if n.Genesis != nil && n.Genesis.NetworkID > 0 {
// Convert the network id to a string to ensure consistency in JSON round-tripping.
Expand Down Expand Up @@ -671,10 +674,20 @@ func (n *Network) getBootstrapIPsAndIDs(skippedNode *Node) ([]string, []string,

// Retrieves the default root dir for storing networks and their
// configuration.
func getDefaultRootDir() (string, error) {
func getDefaultRootNetworkDir() (string, error) {
homeDir, err := os.UserHomeDir()
if err != nil {
return "", err
}
return filepath.Join(homeDir, ".tmpnet", "networks"), nil
}

// getPrometheusServiceDiscoveryDir returns the path, beneath the current
// user's home directory, of the directory used for prometheus file-based
// service discovery configuration. An error is returned only if the home
// directory cannot be determined.
func getPrometheusServiceDiscoveryDir() (string, error) {
	home, err := os.UserHomeDir()
	if err != nil {
		return "", err
	}
	sdDir := filepath.Join(home, ".tmpnet", "prometheus", "file_sd_configs")
	return sdDir, nil
}
3 changes: 3 additions & 0 deletions tests/fixture/tmpnet/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ type NodeRuntimeConfig struct {

// Node supports configuring and running a node participating in a temporary network.
type Node struct {
// Uniquely identifies the network the node is part of to enable monitoring.
NetworkUUID string

// Set by EnsureNodeID which is also called when the node is read.
NodeID ids.NodeID

Expand Down
2 changes: 2 additions & 0 deletions tests/fixture/tmpnet/node_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,14 @@ func (n *Node) readConfig() error {
}

// serializedNodeConfig holds the subset of a Node's fields that writeConfig
// copies out of the Node.
// NOTE(review): presumably this is the form persisted to the node's config
// file - confirm against the remainder of writeConfig/readConfig.
type serializedNodeConfig struct {
// Copied from Node.NetworkUUID, which identifies the network the node is part of.
NetworkUUID string
// Copied from Node.IsEphemeral.
IsEphemeral bool
// Copied from Node.RuntimeConfig.
RuntimeConfig *NodeRuntimeConfig
}

func (n *Node) writeConfig() error {
config := serializedNodeConfig{
NetworkUUID: n.NetworkUUID,
IsEphemeral: n.IsEphemeral,
RuntimeConfig: n.RuntimeConfig,
}
Expand Down
Loading

0 comments on commit bdaf176

Please sign in to comment.