Skip to content

Commit

Permalink
Update README and (obsolete) gcs-retry.patch file
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-jiang committed Nov 1, 2024
1 parent 65621c3 commit bd4c8aa
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 13 deletions.
4 changes: 3 additions & 1 deletion patches/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@ This directory contains the Vector binary that we use. We use a non-standard bin
We patch the GCS sink to be much more greedy in retrying, so that it pretty much retries anything.

## In This Directory
- `gcs-retry.path` -> A patch file which can be applied to the Vector main branch to introduce the retrying behavior we want
- `gcs-retry.patch` -> A patch file which can be applied to the Vector main branch to introduce the retrying behavior we want


The Dockerfiles automatically compile a version of Vector with our patches applied. Check those files for the commit hash that we are based on, in case you'd like to make some updates.

To update the patch, clone the Vector repo, check out the specified commit hash, and make your changes. After making your changes, run `git diff > gcs-retry.patch` to save the diff and copy it into this directory. The build files will build Vector with your patch automatically.

Update: as of Oct 23 2024 (commit 0eaefd6a1476b6e2b8d46d411dcbcb9eeda4e9c3 in the post-git-rewrite monorepo), we no longer use the `gcs-retry.patch` file in our Dockerfile to build the Vector image, as we've moved to a fork of the Vector repository. Instead, we check out a specific commit from a branch on the forked Vector repo by default, with an env var that can override the branch to be checked out. See `discord_data/vector_base/Dockerfile` in the monorepo.

### Currently Patched
The following are patched:
- Fixing GCS Sink error type that allows proper retry handling
- Extremely generous retry logic that functionally retries everything
- Backport updated GCP auth token handling from https://github.com/vectordotdev/vector/pull/20574
- Retry the GCS sink healthcheck every 5 seconds, for up to 3 attempts in total, stopping on the first success. Each unsuccessful healthcheck response is printed to the logs for debugging.
71 changes: 59 additions & 12 deletions patches/gcs-retry.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/src/gcp.rs b/src/gcp.rs
index bfc486f92..148fa9dec 100644
index bfc486f92..baa8e143d 100644

Check warning

Code scanning / check-spelling

Candidate Pattern Warning

Line matches candidate pattern "index (?:[0-9a-z]{7,40},|)[0-9a-z]{7,40}..[0-9a-z]{7,40}" (candidate-pattern)

Check failure

Code scanning / check-spelling

Unrecognized Spelling Error

bfc is not a recognized word. (unrecognized-spelling)
--- a/src/gcp.rs
+++ b/src/gcp.rs
@@ -16,7 +16,7 @@ use hyper::header::AUTHORIZATION;
Expand All @@ -10,18 +10,18 @@ index bfc486f92..148fa9dec 100644
+use tokio::sync::watch;
use vector_lib::configurable::configurable_component;
use vector_lib::sensitive_string::SensitiveString;

@@ -25,6 +25,11 @@ use crate::{config::ProxyConfig, http::HttpClient, http::HttpError};
const SERVICE_ACCOUNT_TOKEN_URL: &str =
"http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token";

+// See https://cloud.google.com/compute/docs/access/authenticate-workloads#applications
+const METADATA_TOKEN_EXPIRY_MARGIN_SECS: u64 = 200;
+
+const METADATA_TOKEN_ERROR_RETRY_SECS: u64 = 2;
+
pub const PUBSUB_URL: &str = "https://pubsub.googleapis.com";

pub static PUBSUB_ADDRESS: Lazy<String> = Lazy::new(|| {
@@ -194,19 +199,25 @@ impl GcpAuthenticator {
async fn token_regenerator(self, sender: watch::Sender<()>) {
Expand Down Expand Up @@ -55,28 +55,75 @@ index bfc486f92..148fa9dec 100644
}
}
diff --git a/src/sinks/gcs_common/config.rs b/src/sinks/gcs_common/config.rs
index 914d780c8..e59a4e8e4 100644
index 914d780c8..41a657ab8 100644
--- a/src/sinks/gcs_common/config.rs
+++ b/src/sinks/gcs_common/config.rs
@@ -6,7 +6,7 @@ use vector_lib::configurable::configurable_component;

@@ -3,10 +3,11 @@ use http::{StatusCode, Uri};
use hyper::Body;
use snafu::Snafu;
use vector_lib::configurable::configurable_component;
+use tokio::time::{interval, Duration};

use crate::{
gcp::{GcpAuthenticator, GcpError},
- http::HttpClient,
+ http::{HttpClient, HttpError},
sinks::{
gcs_common::service::GcsResponse,
util::retries::{RetryAction, RetryLogic},
@@ -141,7 +141,7 @@ pub struct GcsRetryLogic;

@@ -111,14 +112,37 @@ pub fn build_healthcheck(
) -> crate::Result<Healthcheck> {
let healthcheck = async move {
let uri = base_url.parse::<Uri>()?;
- let mut request = http::Request::head(uri).body(Body::empty())?;
-
- auth.apply(&mut request);
+ let mut num_retries = 0;
+ let max_retries = 3;
+ // repeat healthcheck every 5 sec
+ let mut interval = interval(Duration::from_secs(5));
+ let mut num_failures = 0;

let not_found_error = GcsError::BucketNotFound { bucket }.into();

- let response = client.send(request).await?;
- healthcheck_response(response, not_found_error)
+ loop {
+ interval.tick().await;
+ let mut request = http::Request::head(uri.clone()).body(Body::empty())?;
+
+ auth.apply(&mut request);
+
+ let response = client.send(request).await?;
+ num_retries += 1;
+ if response.status().is_success() {
+ // the healthcheck passes on the first success
+ return healthcheck_response(response, not_found_error);
+ } else {
+ // debug the healthcheck response
+ warn!("healthcheck response was not successful! {:#?}", response);
+ num_failures += 1;
+ }
+
+ if num_retries >= max_retries {
+ info!("non-success healthcheck responses = {}", num_failures);
+ info!("total healthcheck attempts = {}", num_retries);
+ return healthcheck_response(response, not_found_error);
+ }
+ }
};

Ok(healthcheck.boxed())
@@ -141,7 +165,7 @@ pub struct GcsRetryLogic;

// This is a clone of HttpRetryLogic for the Body type, should get merged
impl RetryLogic for GcsRetryLogic {
- type Error = hyper::Error;
+ type Error = HttpError;
type Response = GcsResponse;

fn is_retriable_error(&self, _error: &Self::Error) -> bool {
@@ -159,7 +159,7 @@ impl RetryLogic for GcsRetryLogic {
@@ -159,7 +183,7 @@ impl RetryLogic for GcsRetryLogic {
}
_ if status.is_server_error() => RetryAction::Retry(status.to_string().into()),
_ if status.is_success() => RetryAction::Successful,
Expand All @@ -91,7 +138,7 @@ index 0904a67cb..e3fae07e0 100644
+++ b/src/sinks/util/http.rs
@@ -470,6 +470,7 @@ impl RetryLogic for HttpRetryLogic {
let status = response.status();

match status {
+ StatusCode::UNAUTHORIZED => RetryAction::Retry("unauthorized".into()),
StatusCode::TOO_MANY_REQUESTS => RetryAction::Retry("too many requests".into()),
Expand Down

0 comments on commit bd4c8aa

Please sign in to comment.