From 65621c34986f28fc426be92c7a2974ed25795d83 Mon Sep 17 00:00:00 2001 From: Alexander Jiang Date: Thu, 31 Oct 2024 21:30:18 +0000 Subject: [PATCH 1/2] add retry in GCS healthcheck, clean up the changes, print out healthcheck response to logs when the healthcheck attempts fail --- src/sinks/gcs_common/config.rs | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/sinks/gcs_common/config.rs b/src/sinks/gcs_common/config.rs index e59a4e8e41d31..41a657ab8397c 100644 --- a/src/sinks/gcs_common/config.rs +++ b/src/sinks/gcs_common/config.rs @@ -3,6 +3,7 @@ use http::{StatusCode, Uri}; use hyper::Body; use snafu::Snafu; use vector_lib::configurable::configurable_component; +use tokio::time::{interval, Duration}; use crate::{ gcp::{GcpAuthenticator, GcpError}, @@ -111,14 +112,37 @@ pub fn build_healthcheck( ) -> crate::Result { let healthcheck = async move { let uri = base_url.parse::()?; - let mut request = http::Request::head(uri).body(Body::empty())?; - - auth.apply(&mut request); + let mut num_retries = 0; + let max_retries = 3; + // repeat healthcheck every 5 sec + let mut interval = interval(Duration::from_secs(5)); + let mut num_failures = 0; let not_found_error = GcsError::BucketNotFound { bucket }.into(); - let response = client.send(request).await?; - healthcheck_response(response, not_found_error) + loop { + interval.tick().await; + let mut request = http::Request::head(uri.clone()).body(Body::empty())?; + + auth.apply(&mut request); + + let response = client.send(request).await?; + num_retries += 1; + if response.status().is_success() { + // the healthcheck passes on the first success + return healthcheck_response(response, not_found_error); + } else { + // debug the healthcheck response + warn!("healthcheck response was not successful! 
{:#?}", response); + num_failures += 1; + } + + if num_retries >= max_retries { + info!("non-success healthcheck responses = {}", num_failures); + info!("total healthcheck attempts = {}", num_retries); + return healthcheck_response(response, not_found_error); + } + } }; Ok(healthcheck.boxed()) From bd4c8aa0cfaadd9dbb6abf10410fe000dc86ef0d Mon Sep 17 00:00:00 2001 From: Alexander Jiang Date: Fri, 1 Nov 2024 22:34:47 +0000 Subject: [PATCH 2/2] Update README and (obsolete) gcs-retry.patch file --- patches/README.md | 4 ++- patches/gcs-retry.patch | 71 ++++++++++++++++++++++++++++++++++------- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/patches/README.md b/patches/README.md index 3bca14ecb7742..cfc2c6d2e9aac 100644 --- a/patches/README.md +++ b/patches/README.md @@ -3,16 +3,18 @@ This directory contains the Vector binary that we use. We use a non-standard bin We patch the GCS sink to be much more greedy in retrying, so that it pretty much retries anything. ## In This Directory -- `gcs-retry.path` -> A patch file which can be applied to the Vector main branch to introduce the retrying behavior we want +- `gcs-retry.patch` -> A patch file which can be applied to the Vector main branch to introduce the retrying behavior we want The Dockerfiles will compile a version of Vector with our patches in them automatically. Check those files for the commit hash that we are based off of in the case you'd like to make some updates. To update the patch, clone the vector repo, checkout the specified commit hash and make your changes. After making your changes, run `git diff > gcs-retry.patch` to save the diff and copy it into this directory. The build files will build vector with your patch automatically. +Update: as of Oct 23 2024 (commit 0eaefd6a1476b6e2b8d46d411dcbcb9eeda4e9c3 in the post-git-rewrite monorepo), we no longer use the `gcs-retry.patch` file in our Dockerfile to build the Vector image, as we've moved to a fork of the Vector repository. 
Instead, we check out a specific commit from a branch on the forked Vector repo by default, with an env var that can override the branch to be checked out. See the `discord_data/vector_base/Dockerfile` file in the monorepo. ### Currently Patched The following are patched: - Fixing GCS Sink error type that allows proper retry handling - Extremely generous retry logic that functionaly retries everything - Backport updated GCP auth token handling from https://github.com/vectordotdev/vector/pull/20574 +- Retry the GCS sink healthcheck, every 5 seconds, up to 3 attempts in total, stopping on the first success. Only non-success HTTP responses are retried; a request error still fails the healthcheck immediately. Prints the healthcheck response (if not successful) to logs on each attempt for debugging diff --git a/patches/gcs-retry.patch b/patches/gcs-retry.patch index 796386b482164..3bbf3f04cb808 100644 --- a/patches/gcs-retry.patch +++ b/patches/gcs-retry.patch @@ -1,5 +1,5 @@ diff --git a/src/gcp.rs b/src/gcp.rs -index bfc486f92..148fa9dec 100644 +index bfc486f92..baa8e143d 100644 --- a/src/gcp.rs +++ b/src/gcp.rs @@ -16,7 +16,7 @@ use hyper::header::AUTHORIZATION; @@ -10,18 +10,18 @@ index bfc486f92..148fa9dec 100644 +use tokio::sync::watch; use vector_lib::configurable::configurable_component; use vector_lib::sensitive_string::SensitiveString; - + @@ -25,6 +25,11 @@ use crate::{config::ProxyConfig, http::HttpClient, http::HttpError}; const SERVICE_ACCOUNT_TOKEN_URL: &str = "http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token"; - + +// See https://cloud.google.com/compute/docs/access/authenticate-workloads#applications +const METADATA_TOKEN_EXPIRY_MARGIN_SECS: u64 = 200; + +const METADATA_TOKEN_ERROR_RETRY_SECS: u64 = 2; + pub const PUBSUB_URL: &str = "https://pubsub.googleapis.com"; - + pub static PUBSUB_ADDRESS: Lazy = Lazy::new(|| { @@ -194,19 +199,25 @@ impl GcpAuthenticator { async fn token_regenerator(self, sender: watch::Sender<()>) { @@ -55,11 +55,15 @@ index bfc486f92..148fa9dec 100644 } } diff --git a/src/sinks/gcs_common/config.rs 
b/src/sinks/gcs_common/config.rs -index 914d780c8..e59a4e8e4 100644 +index 914d780c8..41a657ab8 100644 --- a/src/sinks/gcs_common/config.rs +++ b/src/sinks/gcs_common/config.rs -@@ -6,7 +6,7 @@ use vector_lib::configurable::configurable_component; - +@@ -3,10 +3,11 @@ use http::{StatusCode, Uri}; + use hyper::Body; + use snafu::Snafu; + use vector_lib::configurable::configurable_component; ++use tokio::time::{interval, Duration}; + use crate::{ gcp::{GcpAuthenticator, GcpError}, - http::HttpClient, @@ -67,16 +71,59 @@ index 914d780c8..e59a4e8e4 100644 sinks::{ gcs_common::service::GcsResponse, util::retries::{RetryAction, RetryLogic}, -@@ -141,7 +141,7 @@ pub struct GcsRetryLogic; - +@@ -111,14 +112,37 @@ pub fn build_healthcheck( + ) -> crate::Result { + let healthcheck = async move { + let uri = base_url.parse::()?; +- let mut request = http::Request::head(uri).body(Body::empty())?; +- +- auth.apply(&mut request); ++ let mut num_retries = 0; ++ let max_retries = 3; ++ // repeat healthcheck every 5 sec ++ let mut interval = interval(Duration::from_secs(5)); ++ let mut num_failures = 0; + + let not_found_error = GcsError::BucketNotFound { bucket }.into(); + +- let response = client.send(request).await?; +- healthcheck_response(response, not_found_error) ++ loop { ++ interval.tick().await; ++ let mut request = http::Request::head(uri.clone()).body(Body::empty())?; ++ ++ auth.apply(&mut request); ++ ++ let response = client.send(request).await?; ++ num_retries += 1; ++ if response.status().is_success() { ++ // the healthcheck passes on the first success ++ return healthcheck_response(response, not_found_error); ++ } else { ++ // debug the healthcheck response ++ warn!("healthcheck response was not successful! 
{:#?}", response); ++ num_failures += 1; ++ } ++ ++ if num_retries >= max_retries { ++ info!("non-success healthcheck responses = {}", num_failures); ++ info!("total healthcheck attempts = {}", num_retries); ++ return healthcheck_response(response, not_found_error); ++ } ++ } + }; + + Ok(healthcheck.boxed()) +@@ -141,7 +165,7 @@ pub struct GcsRetryLogic; + // This is a clone of HttpRetryLogic for the Body type, should get merged impl RetryLogic for GcsRetryLogic { - type Error = hyper::Error; + type Error = HttpError; type Response = GcsResponse; - + fn is_retriable_error(&self, _error: &Self::Error) -> bool { -@@ -159,7 +159,7 @@ impl RetryLogic for GcsRetryLogic { +@@ -159,7 +183,7 @@ impl RetryLogic for GcsRetryLogic { } _ if status.is_server_error() => RetryAction::Retry(status.to_string().into()), _ if status.is_success() => RetryAction::Successful, @@ -91,7 +138,7 @@ index 0904a67cb..e3fae07e0 100644 +++ b/src/sinks/util/http.rs @@ -470,6 +470,7 @@ impl RetryLogic for HttpRetryLogic { let status = response.status(); - + match status { + StatusCode::UNAUTHORIZED => RetryAction::Retry("unauthorized".into()), StatusCode::TOO_MANY_REQUESTS => RetryAction::Retry("too many requests".into()),