Skip to content

Commit

Permalink
update metrics status scripts (#31037)
Browse files Browse the repository at this point in the history
* update metrics status scripts

* add exit condition
  • Loading branch information
joeaba authored Apr 4, 2023
1 parent b0540ff commit 80b2572
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 42 deletions.
6 changes: 3 additions & 3 deletions metrics/metrics-internal/chronograf_8888_internal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
cd "$(dirname "$0")"

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

Expand Down Expand Up @@ -34,7 +34,7 @@ sudo chown buildkite-agent:buildkite-agent certs


# (Re)start the container
sudo sudo docker run \
sudo docker run \
--detach \
--env AUTH_DURATION=24h \
--env TLS_CERTIFICATE=/certs/fullchain.pem \
Expand All @@ -53,4 +53,4 @@ sudo sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
6 changes: 3 additions & 3 deletions metrics/metrics-internal/chronograf_8889_internal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
cd "$(dirname "$0")"

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

Expand Down Expand Up @@ -43,7 +43,7 @@ sudo docker run \
--env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \
--env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \
--env PUBLIC_URL=https://internal-metrics.solana.com:8889 \
--env TOKEN_SECRET= \
--env TOKEN_SECRET="$TOKEN_SECRET" \
--env inactivity-duration=48h \
--name=chronograf_8889_internal \
--net=influxdb \
Expand All @@ -53,4 +53,4 @@ sudo docker run \
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
8 changes: 6 additions & 2 deletions metrics/metrics-internal/grafana_internal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
#
# (Re)starts the Grafana containers
#

cd "$(dirname "$0")"

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

Expand All @@ -32,7 +33,6 @@ sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/
sudo chmod 0444 certs/*
sudo chown buildkite-agent:buildkite-agent certs


#(Re)start the container
sudo docker run \
--detach \
Expand All @@ -41,6 +41,10 @@ sudo docker run \
--publish 3000:3000 \
--user root:root \
--env GF_PATHS_CONFIG=/grafana.ini \
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
--volume "$PWD"/certs:/certs:ro \
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
--volume /var/lib/grafana:/var/lib/grafana \
Expand Down
4 changes: 3 additions & 1 deletion metrics/metrics-internal/influxdb_internal.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
cd "$(dirname "$0")"

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

Expand Down Expand Up @@ -39,6 +39,8 @@ sudo docker run \
--net=influxdb \
--publish 8086:8086 \
--user "$(id -u):$(id -g)" \
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
--volume "$PWD"/certs:/certs \
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
--volume /var/lib/influxdb:/var/lib/influxdb \
Expand Down
14 changes: 10 additions & 4 deletions metrics/metrics-internal/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
cd "$(dirname "$0")"

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

Expand Down Expand Up @@ -48,6 +48,10 @@ sudo docker run \
--publish 3000:3000 \
--user root:root \
--env GF_PATHS_CONFIG=/grafana.ini \
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
--volume "$PWD"/certs:/certs:ro \
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
--volume /var/lib/grafana:/var/lib/grafana \
Expand All @@ -61,6 +65,8 @@ sudo docker run \
--net=influxdb \
--publish 8086:8086 \
--user "$(id -u):$(id -g)" \
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
--volume "$PWD"/certs:/certs \
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
--volume /var/lib/influxdb:/var/lib/influxdb \
Expand Down Expand Up @@ -90,9 +96,9 @@ sudo docker run \
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"

sudo sudo docker run \
sudo docker run \
--detach \
--env AUTH_DURATION=24h \
--env TLS_CERTIFICATE=/certs/fullchain.pem \
Expand All @@ -111,7 +117,7 @@ sudo sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"

curl -h | sed -ne '/--tlsv/p'
curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping
Expand Down
70 changes: 42 additions & 28 deletions metrics/metrics-internal/status.sh
Original file line number Diff line number Diff line change
@@ -1,33 +1,47 @@
#!/bin/bash -ex
#
# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
#
cd "$(dirname "$0")"
#!/bin/bash

cd "$(dirname "$0")" || exit

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"

# Print a snapshot of container and host health under a "+++ status" heading.
echo +++ status
(
# Echo each command before it runs so the log shows which output belongs to which command.
set -x
pwd
# Containers reported by docker, with untruncated columns and on-disk sizes.
sudo docker ps --no-trunc --size
# Total size of each service's data directory.
sudo du -hs /var/lib/{influxdb,chronograf,grafana}
# Host-level disk, memory and load figures.
df -h
free -h
uptime
)

# If a container is not in the running state (for example, it has exited), send a notification to Slack and Discord and redeploy the container.

# Check each metrics-internal container; when one is not running, notify
# Slack and Discord and rerun its <container>.sh start script.
for container in influxdb_internal chronograf_8888_internal chronograf_8889_internal grafana_internal; do
  # Query the state once. "exited" (or any other state, or a missing
  # container) is already covered by the "not running" test below.
  # "|| true" keeps a failed inspect from aborting the script when it is
  # run with "set -e"; the state is then simply empty.
  container_state=$(sudo docker inspect --format='{{.State.Status}}' "$container") || true
  if [[ "$container_state" != "running" ]]; then
    curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in metrics-internal server"}' "$SLACK_WEBHOOK"
    curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in metrics-internal server"}' "$DISCORD_WEBHOOK"
    echo "Starting up script"
    sudo bash "$container.sh"
    # Pause before moving on to the next container.
    sleep 30
  fi
done
# Containers monitored by this script, in the order they are checked.
# Each name is also the basename of the <name>.sh script used to redeploy it.
containers=(
  "influxdb_internal"
  "chronograf_8889_internal"
  "chronograf_8888_internal"
  "grafana_internal"
)

#######################################
# Send a message to Discord.
# Globals:   DISCORD_WEBHOOK (read) - URL the JSON payload is POSTed to
# Arguments: $1 - message text
# Outputs:   whatever curl prints (errors only reach stderr, due to -sS)
# Returns:   curl's exit status
#######################################
send_discord_message() {
  local message="$1"

  # Escape characters that would otherwise terminate or corrupt the JSON
  # string. Backslashes must be handled first so the escapes added
  # afterwards are not doubled.
  message=${message//\\/\\\\}
  message=${message//\"/\\\"}
  message=${message//$'\n'/\\n}
  message=${message//$'\r'/\\r}
  message=${message//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
}

#######################################
# Send a critical alert to PagerDuty.
# Globals:   PAGERDUTY_WEBHOOK (read) - URL the JSON event is POSTed to
# Arguments: $1 - alert summary text
# Outputs:   whatever curl prints (errors only reach stderr, due to -sS)
# Returns:   curl's exit status
# NOTE(review): the event carries no "routing_key" field; confirm that the
# endpoint behind PAGERDUTY_WEBHOOK accepts this payload shape.
#######################################
send_pagerduty_alert() {
  local description="$1"

  # Escape characters that would otherwise terminate or corrupt the JSON
  # string. Backslashes must be handled first so the escapes added
  # afterwards are not doubled.
  description=${description//\\/\\\\}
  description=${description//\"/\\\"}
  description=${description//$'\n'/\\n}
  description=${description//$'\r'/\\r}
  description=${description//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK"
}

# Walk the monitored containers. Any container that is not running is
# redeployed through its own <name>.sh script and the outcome is reported.
for container in "${containers[@]}"; do
  # inspect errors (e.g. the container does not exist) are silenced; the
  # state is then empty and is treated like any other non-running state.
  state=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)

  # Healthy container: nothing to do.
  if [[ "$state" == "running" ]]; then
    continue
  fi

  send_discord_message "$container is down and it's being redeployed..."

  # Redeploy by running the container's start script from this directory.
  chmod +x "$container.sh"
  ./"$container.sh"
  sleep 10

  # Look at the state again to see whether the redeploy worked.
  state=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)

  if [[ "$state" == "running" ]]; then
    send_discord_message "$container has been redeployed successfully"
  else
    # Still down: report on Discord and escalate to PagerDuty.
    send_discord_message "$container failed to redeploy and manual intervention is required"
    send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
  fi
done
2 changes: 1 addition & 1 deletion metrics/metrics-main/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file=5 \
$CHRONOGRAF_IMAGE --influxdb-url=https://metrics.solana.com:8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086

sudo docker run \
--detach \
Expand Down
7 changes: 7 additions & 0 deletions metrics/metrics-main/status.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
#!/bin/bash

cd "$(dirname "$0")" || exit

if [[ -z $HOST ]]; then
HOST=metrics.solana.com
fi
echo "HOST: $HOST"

# List of containers
containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")

Expand Down

0 comments on commit 80b2572

Please sign in to comment.