From 80b25726e1087c108ca5651450f2e795f808a27c Mon Sep 17 00:00:00 2001 From: joeaba <77398477+joeaba@users.noreply.github.com> Date: Tue, 4 Apr 2023 09:03:57 -0500 Subject: [PATCH] update metrics status scripts (#31037) * update metrics status scripts * add exit condition --- .../chronograf_8888_internal.sh | 6 +- .../chronograf_8889_internal.sh | 6 +- metrics/metrics-internal/grafana_internal.sh | 8 ++- metrics/metrics-internal/influxdb_internal.sh | 4 +- metrics/metrics-internal/start.sh | 14 ++-- metrics/metrics-internal/status.sh | 70 +++++++++++-------- metrics/metrics-main/start.sh | 2 +- metrics/metrics-main/status.sh | 7 ++ 8 files changed, 75 insertions(+), 42 deletions(-) diff --git a/metrics/metrics-internal/chronograf_8888_internal.sh b/metrics/metrics-internal/chronograf_8888_internal.sh index 9d85fc456ab2fb..907524720e8695 100644 --- a/metrics/metrics-internal/chronograf_8888_internal.sh +++ b/metrics/metrics-internal/chronograf_8888_internal.sh @@ -5,7 +5,7 @@ cd "$(dirname "$0")" if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" @@ -34,7 +34,7 @@ sudo chown buildkite-agent:buildkite-agent certs # (Re)start the container -sudo sudo docker run \ +sudo docker run \ --detach \ --env AUTH_DURATION=24h \ --env TLS_CERTIFICATE=/certs/fullchain.pem \ @@ -53,4 +53,4 @@ sudo sudo docker run \ --volume /var/lib/chronograf:/var/lib/chronograf \ --log-opt max-size=1g \ --log-opt max-file="5" \ - $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD" diff --git a/metrics/metrics-internal/chronograf_8889_internal.sh b/metrics/metrics-internal/chronograf_8889_internal.sh index 9c48fff6d4d0a4..41d7fc6456d67c 100644 --- a/metrics/metrics-internal/chronograf_8889_internal.sh +++ b/metrics/metrics-internal/chronograf_8889_internal.sh @@ -5,7 +5,7 @@ cd "$(dirname "$0")" if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" @@ -43,7 +43,7 @@ sudo docker run \ --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \ --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \ --env PUBLIC_URL=https://internal-metrics.solana.com:8889 \ - --env TOKEN_SECRET= \ + --env TOKEN_SECRET="$TOKEN_SECRET" \ --env inactivity-duration=48h \ --name=chronograf_8889_internal \ --net=influxdb \ @@ -53,4 +53,4 @@ sudo docker run \ --volume /var/lib/chronograf_8889:/var/lib/chronograf \ --log-opt max-size=1g \ --log-opt max-file="5" \ - $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD" diff --git a/metrics/metrics-internal/grafana_internal.sh b/metrics/metrics-internal/grafana_internal.sh index d83b902313ccc4..1675b69a814b49 100644 --- a/metrics/metrics-internal/grafana_internal.sh +++ b/metrics/metrics-internal/grafana_internal.sh @@ -2,10 +2,11 @@ # # (Re)starts the Grafana containers # + cd "$(dirname "$0")" if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" @@ -32,7 +33,6 @@ sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ sudo chmod 0444 certs/* sudo chown buildkite-agent:buildkite-agent certs - #(Re)start the container sudo docker run \ --detach \ @@ -41,6 +41,10 @@ sudo docker run \ --publish 3000:3000 \ --user root:root \ --env GF_PATHS_CONFIG=/grafana.ini \ + --env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \ + --env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \ + --env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \ + --env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \ --volume "$PWD"/certs:/certs:ro \ --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \ --volume /var/lib/grafana:/var/lib/grafana \ diff --git a/metrics/metrics-internal/influxdb_internal.sh b/metrics/metrics-internal/influxdb_internal.sh index 6c4ec17e433b6b..44d81658813875 100644 --- a/metrics/metrics-internal/influxdb_internal.sh +++ b/metrics/metrics-internal/influxdb_internal.sh @@ -5,7 +5,7 @@ cd "$(dirname "$0")" if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" @@ -39,6 +39,8 @@ sudo docker run \ --net=influxdb \ --publish 8086:8086 \ --user "$(id -u):$(id -g)" \ + --env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \ + --env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \ --volume "$PWD"/certs:/certs \ --volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \ --volume /var/lib/influxdb:/var/lib/influxdb \ diff --git a/metrics/metrics-internal/start.sh b/metrics/metrics-internal/start.sh index d331a0fddad37f..13b4831ef5248a 100644 --- a/metrics/metrics-internal/start.sh +++ b/metrics/metrics-internal/start.sh @@ -6,7 +6,7 @@ cd "$(dirname "$0")" if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" @@ -48,6 +48,10 @@ sudo docker run \ --publish 3000:3000 \ --user root:root \ --env GF_PATHS_CONFIG=/grafana.ini \ + --env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \ + --env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \ + --env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \ + --env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \ --volume "$PWD"/certs:/certs:ro \ --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \ --volume /var/lib/grafana:/var/lib/grafana \ @@ -61,6 +65,8 @@ sudo docker run \ --net=influxdb \ --publish 8086:8086 \ --user "$(id -u):$(id -g)" \ + --env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \ + --env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \ --volume "$PWD"/certs:/certs \ --volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \ --volume /var/lib/influxdb:/var/lib/influxdb \ @@ -90,9 +96,9 @@ sudo docker run \ --volume /var/lib/chronograf_8889:/var/lib/chronograf \ --log-opt max-size=1g \ --log-opt max-file="5" \ - $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD" -sudo sudo docker run \ +sudo docker run \ --detach \ --env AUTH_DURATION=24h \ --env TLS_CERTIFICATE=/certs/fullchain.pem \ @@ -111,7 +117,7 @@ sudo sudo docker run \ --volume /var/lib/chronograf:/var/lib/chronograf \ --log-opt max-size=1g \ --log-opt max-file="5" \ - $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD" curl -h | sed -ne '/--tlsv/p' curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping diff --git a/metrics/metrics-internal/status.sh b/metrics/metrics-internal/status.sh index beb57a12a2d76b..902439dacc5785 100644 --- a/metrics/metrics-internal/status.sh +++ b/metrics/metrics-internal/status.sh @@ -1,33 +1,47 @@ -#!/bin/bash -ex -# -# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers -# -cd "$(dirname "$0")" +#!/bin/bash + +cd "$(dirname "$0")" || exit if [[ -z $HOST ]]; then - HOST=metrics.solana.com + HOST=internal-metrics.solana.com fi echo "HOST: $HOST" -echo +++ status -( - set -x - pwd - sudo docker ps --no-trunc --size - sudo du -hs /var/lib/{influxdb,chronograf,grafana} - df -h - free -h - uptime -) - -# If the container is not running state or exited state, then sent the notification on slack and redeploy the container again - -for container in influxdb_internal chronograf_8888_internal chronograf_8889_internal grafana_internal; do - if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then - curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in metrics-internal server"}' "$SLACK_WEBHOOK" - curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in metrics-internal server"}' "$DISCORD_WEBHOOK" - echo "Starting up script" - sudo bash $container.sh - sleep 30 - fi - done +# List of containers +containers=("influxdb_internal" "chronograf_8889_internal" "chronograf_8888_internal" "grafana_internal") + +# Send a message to Discord +send_discord_message() { + local message="$1" + curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK" +} + +# Send a critical alert to PagerDuty +send_pagerduty_alert() { + local description="$1" + curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK" +} + +# Iterate over the containers and check their status +for container in "${containers[@]}"; do + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null) + + if [ "$container_status" != "running" ]; then + send_discord_message "$container is down and it's being redeployed..." + + # Run the container.sh script to redeploy the container + chmod +x "$container.sh" + ./"$container.sh" + sleep 10 + + # Check the container status again + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null) + + if [ "$container_status" != "running" ]; then + send_discord_message "$container failed to redeploy and manual intervention is required" + send_pagerduty_alert "$container failed to redeploy and manual intervention is required." + else + send_discord_message "$container has been redeployed successfully" + fi + fi +done diff --git a/metrics/metrics-main/start.sh b/metrics/metrics-main/start.sh index 0a3a969a1b620e..5b7bbac309217a 100755 --- a/metrics/metrics-main/start.sh +++ b/metrics/metrics-main/start.sh @@ -132,7 +132,7 @@ sudo docker run \ --volume /var/lib/chronograf:/var/lib/chronograf \ --log-opt max-size=1g \ --log-opt max-file=5 \ - $CHRONOGRAF_IMAGE --influxdb-url=https://metrics.solana.com:8086 + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 sudo docker run \ --detach \ diff --git a/metrics/metrics-main/status.sh b/metrics/metrics-main/status.sh index e1019e903a14aa..b39de16c68e2f0 100755 --- a/metrics/metrics-main/status.sh +++ b/metrics/metrics-main/status.sh @@ -1,5 +1,12 @@ #!/bin/bash +cd "$(dirname "$0")" || exit + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + # List of containers containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")