From ac03408b0160798ea34c910d3c6e47c408f0fd40 Mon Sep 17 00:00:00 2001 From: Philip Jenvey Date: Wed, 27 Mar 2024 12:53:14 -0700 Subject: [PATCH] feat: emit pool timeouts/conditional failures as metrics (not sentry) and refactor BigtableClientManager::Error to use BigTableError Closes: SYNC-4190 --- .../autoconnect-settings/src/app_state.rs | 45 ++++++++-------- autoendpoint/src/error.rs | 33 +++++++----- autoendpoint/src/routes/registration.rs | 4 +- .../src/db/bigtable/bigtable_client/error.rs | 36 ++++++++----- .../src/db/bigtable/bigtable_client/mod.rs | 12 +++-- autopush-common/src/db/bigtable/pool.rs | 52 +++++++------------ autopush-common/src/db/error.rs | 6 +-- autopush-common/src/errors.rs | 10 ---- 8 files changed, 95 insertions(+), 103 deletions(-) diff --git a/autoconnect/autoconnect-settings/src/app_state.rs b/autoconnect/autoconnect-settings/src/app_state.rs index 67c902f15..5ee0d7cb4 100644 --- a/autoconnect/autoconnect-settings/src/app_state.rs +++ b/autoconnect/autoconnect-settings/src/app_state.rs @@ -7,6 +7,7 @@ use autopush_common::db::dual::DualClientImpl; #[cfg(feature = "dynamodb")] use autopush_common::db::dynamodb::DdbClientImpl; use cadence::StatsdClient; +use config::ConfigError; use fernet::{Fernet, MultiFernet}; use tokio::sync::RwLock; @@ -15,7 +16,6 @@ use autoconnect_common::{ registry::ClientRegistry, }; use autopush_common::db::{client::DbClient, DbSettings, StorageType}; -use autopush_common::errors::{ApcErrorKind, Result}; use crate::{Settings, ENV_PREFIX}; @@ -39,32 +39,27 @@ pub struct AppState { } impl AppState { - pub fn from_settings(settings: Settings) -> Result { + pub fn from_settings(settings: Settings) -> Result { let crypto_key = &settings.crypto_key; if !(crypto_key.starts_with('[') && crypto_key.ends_with(']')) { - return Err( - ApcErrorKind::ConfigError(config::ConfigError::Message(format!( - "Invalid {}_CRYPTO_KEY", - ENV_PREFIX - ))) - .into(), - ); + return Err(ConfigError::Message(format!( + "Invalid {ENV_PREFIX}_CRYPTO_KEY" + ))); } let crypto_key = &crypto_key[1..crypto_key.len() - 1]; debug!("🔐 Fernet keys: {:?}", &crypto_key); let fernets: Vec = crypto_key .split(',') .map(|s| s.trim().to_string()) - .map(|key| { - Fernet::new(&key).unwrap_or_else(|| panic!("Invalid {}_CRYPTO_KEY", ENV_PREFIX)) - }) + .map(|key| Fernet::new(&key)._or_else(|| panic!("Invalid {ENV_PREFIX}_CRYPTO_KEY"))) .collect(); let fernet = MultiFernet::new(fernets); let metrics = autopush_common::metrics::builder( &settings.statsd_label, &settings.statsd_host, settings.statsd_port, - )? + ) + .map_err(|e| ConfigError::Message(e.to_string()))? // Temporary tag to distinguish from the legacy autopush(connect) .with_tag("autoconnect", "true") .build(); @@ -78,16 +73,21 @@ impl AppState { #[allow(unused)] let db: Box = match storage_type { #[cfg(feature = "dynamodb")] - StorageType::DynamoDb => Box::new(DdbClientImpl::new(metrics.clone(), &db_settings)?), + StorageType::DynamoDb => Box::new( + DdbClientImpl::new(metrics.clone(), &db_settings) + .map_err(|e| ConfigError::Message(e.to_string()))?, + ), #[cfg(feature = "bigtable")] StorageType::BigTable => { - let client = BigTableClientImpl::new(metrics.clone(), &db_settings)?; + let client = BigTableClientImpl::new(metrics.clone(), &db_settings) + .map_err(|e| ConfigError::Message(e.to_string()))?; client.spawn_sweeper(Duration::from_secs(30)); Box::new(client) } #[cfg(all(feature = "bigtable", feature = "dynamodb"))] StorageType::Dual => { - let client = DualClientImpl::new(metrics.clone(), &db_settings)?; + let client = DualClientImpl::new(metrics.clone(), &db_settings) + .map_err(|e| ConfigError::Message(e.to_string()))?; client.spawn_sweeper(Duration::from_secs(30)); Box::new(client) } @@ -122,17 +122,14 @@ impl AppState { /// Initialize the `BroadcastChangeTracker` /// /// Via `autoconnect_common::megaphone::init_and_spawn_megaphone_updater` - pub async fn init_and_spawn_megaphone_updater(&self) -> Result<()> { + pub async fn init_and_spawn_megaphone_updater(&self) -> Result<(), ConfigError> { let Some(ref url) = self.settings.megaphone_api_url else { return Ok(()); }; let Some(ref token) = self.settings.megaphone_api_token else { - return Err( - ApcErrorKind::ConfigError(config::ConfigError::Message(format!( - "{ENV_PREFIX}__MEGAPHONE_API_URL requires {ENV_PREFIX}__MEGAPHONE_API_TOKEN" - ))) - .into(), - ); + return Err(ConfigError::Message(format!( + "{ENV_PREFIX}__MEGAPHONE_API_URL requires {ENV_PREFIX}__MEGAPHONE_API_TOKEN" + ))); }; init_and_spawn_megaphone_updater( &self.broadcaster, @@ -143,7 +140,7 @@ impl AppState { self.settings.megaphone_poll_interval, ) .await - .map_err(|e| ApcErrorKind::GeneralError(format!("{}", e)))?; + .map_err(|e| ConfigError::Message(e.to_string()))?; Ok(()) } } diff --git a/autoendpoint/src/error.rs b/autoendpoint/src/error.rs index 54350bb61..039462495 100644 --- a/autoendpoint/src/error.rs +++ b/autoendpoint/src/error.rs @@ -1,5 +1,4 @@ //! Error types and transformations -// TODO: Collpase these into `autopush_common::error` use crate::headers::vapid::VapidError; use crate::routers::RouterError; @@ -82,6 +81,9 @@ pub enum ApiErrorKind { #[error("Database error: {0}")] Database(#[from] DbError), + #[error("Conditional database operation failed: {0}")] + Conditional(String), + #[error("Invalid token")] InvalidToken, @@ -150,7 +152,9 @@ impl ApiErrorKind { ApiErrorKind::LogCheck => StatusCode::IM_A_TEAPOT, - ApiErrorKind::Database(DbError::Backoff(_)) => StatusCode::SERVICE_UNAVAILABLE, + ApiErrorKind::Database(DbError::Backoff(_)) | ApiErrorKind::Conditional(_) => { + StatusCode::SERVICE_UNAVAILABLE + } ApiErrorKind::General(_) | ApiErrorKind::Io(_) @@ -192,6 +196,7 @@ impl ApiErrorKind { ApiErrorKind::Io(_) => "io", ApiErrorKind::Metrics(_) => "metrics", ApiErrorKind::Database(_) => "database", + ApiErrorKind::Conditional(_) => "conditional", ApiErrorKind::EndpointUrl(_) => "endpoint_url", ApiErrorKind::RegistrationSecretHash(_) => "registration_secret_hash", }) @@ -202,22 +207,21 @@ impl ApiErrorKind { match self { // ignore selected validation errors. ApiErrorKind::Router(e) => e.is_sentry_event(), - _ => !matches!( - self, - // Ignore common webpush errors - ApiErrorKind::NoTTL | ApiErrorKind::InvalidEncryption(_) | - // Ignore common VAPID erros - ApiErrorKind::VapidError(_) + // Ignore common webpush errors + ApiErrorKind::NoTTL | ApiErrorKind::InvalidEncryption(_) | + // Ignore common VAPID erros + ApiErrorKind::VapidError(_) | ApiErrorKind::Jwt(_) | ApiErrorKind::TokenHashValidation(_) | ApiErrorKind::InvalidAuthentication | ApiErrorKind::InvalidLocalAuth(_) | - // Ignore missing or invalid user errors - ApiErrorKind::NoUser | ApiErrorKind::NoSubscription | - // Ignore oversized payload. - ApiErrorKind::PayloadError(_) | - ApiErrorKind::Validation(_), - ), + // Ignore missing or invalid user errors + ApiErrorKind::NoUser | ApiErrorKind::NoSubscription | + // Ignore oversized payload. + ApiErrorKind::PayloadError(_) | + ApiErrorKind::Validation(_) | + ApiErrorKind::Conditional(_) => false, + _ => true, } } @@ -259,6 +263,7 @@ impl ApiErrorKind { | ApiErrorKind::Io(_) | ApiErrorKind::Metrics(_) | ApiErrorKind::Database(_) + | ApiErrorKind::Conditional(_) | ApiErrorKind::PayloadError(_) | ApiErrorKind::InvalidRouterToken | ApiErrorKind::RegistrationSecretHash(_) diff --git a/autoendpoint/src/routes/registration.rs b/autoendpoint/src/routes/registration.rs index fc0c2c3c1..a7edb90c7 100644 --- a/autoendpoint/src/routes/registration.rs +++ b/autoendpoint/src/routes/registration.rs @@ -116,8 +116,8 @@ pub async fn update_token_route( trace!("🌍 Updating user with UAID {uaid}"); trace!("🌍 user = {user:?}"); if !app_state.db.update_user(&mut user).await? { - // Unlikely to occur on mobile records - return Err(ApiErrorKind::General("Conditional update failed".to_owned()).into()); + // Occurs occasionally on mobile records + return Err(ApiErrorKind::Conditional("update_user".to_owned()).into()); } trace!("🌍 Finished updating token for UAID {uaid}"); diff --git a/autopush-common/src/db/bigtable/bigtable_client/error.rs b/autopush-common/src/db/bigtable/bigtable_client/error.rs index 48f0689b6..16f70536d 100644 --- a/autopush-common/src/db/bigtable/bigtable_client/error.rs +++ b/autopush-common/src/db/bigtable/bigtable_client/error.rs @@ -1,6 +1,7 @@ use std::fmt::{self, Display}; use backtrace::Backtrace; +use deadpool::managed::{PoolError, TimeoutType}; use thiserror::Error; use crate::errors::ReportableError; @@ -109,12 +110,13 @@ pub enum BigTableError { #[error("BigTable Admin Error: {0}")] Admin(String, Option), - #[error("Bigtable Recycle request")] - Recycle, - - /// General Pool builder errors. + /// General Pool errors #[error("Pool Error: {0}")] - Pool(String), + Pool(Box>), + + /// Timeout occurred while getting a pooled connection + #[error("Pool Timeout: {0:?}")] + PoolTimeout(TimeoutType), #[error("BigTable config error: {0}")] Config(String), @@ -130,13 +132,15 @@ impl ReportableError for BigTableError { } fn is_sentry_event(&self) -> bool { - // eventually, only capture important errors - //matches!(&self, BigTableError::Admin(_, _)) - true + #[allow(clippy::match_like_matches_macro)] + match self { + BigTableError::PoolTimeout(_) => false, + _ => true, + } } fn metric_label(&self) -> Option<&'static str> { - let err = match &self { + let err = match self { BigTableError::InvalidRowResponse(_) => "storage.bigtable.error.invalid_row_response", BigTableError::InvalidChunk(_) => "storage.bigtable.error.invalid_chunk", BigTableError::Read(_) => "storage.bigtable.error.read", @@ -144,16 +148,24 @@ impl ReportableError for BigTableError { BigTableError::Status(_, _) => "storage.bigtable.error.status", BigTableError::WriteTime(_) => "storage.bigtable.error.writetime", BigTableError::Admin(_, _) => "storage.bigtable.error.admin", - BigTableError::Recycle => "storage.bigtable.error.recycle", BigTableError::Pool(_) => "storage.bigtable.error.pool", + BigTableError::PoolTimeout(_) => "storage.bigtable.error.pool_timeout", BigTableError::GRPC(_) => "storage.bigtable.error.grpc", BigTableError::Config(_) => "storage.bigtable.error.config", }; Some(err) } + fn tags(&self) -> Vec<(&str, String)> { + #[allow(clippy::match_like_matches_macro)] + match self { + BigTableError::PoolTimeout(tt) => vec![("type", format!("{tt:?}").to_lowercase())], + _ => vec![], + } + } + fn extras(&self) -> Vec<(&str, String)> { - match &self { + match self { BigTableError::InvalidRowResponse(s) => vec![("error", s.to_string())], BigTableError::InvalidChunk(s) => vec![("error", s.to_string())], BigTableError::GRPC(s) => vec![("error", s.to_string())], @@ -170,7 +182,7 @@ impl ReportableError for BigTableError { }; x } - BigTableError::Pool(s) => vec![("error", s.to_owned())], + BigTableError::Pool(e) => vec![("error", e.to_string())], _ => vec![], } } diff --git a/autopush-common/src/db/bigtable/bigtable_client/mod.rs b/autopush-common/src/db/bigtable/bigtable_client/mod.rs index db76922e4..32296a21f 100644 --- a/autopush-common/src/db/bigtable/bigtable_client/mod.rs +++ b/autopush-common/src/db/bigtable/bigtable_client/mod.rs @@ -779,7 +779,10 @@ impl BigtableDb { /// stack. /// /// - pub async fn health_check(&mut self, metrics: Arc) -> DbResult { + pub async fn health_check( + &mut self, + metrics: Arc, + ) -> Result { // It is recommended that we pick a random key to perform the health check. Selecting // a single key for all health checks causes a "hot tablet" to arise. The `PingAndWarm` // is intended to be used prior to large updates and is not recommended for use in @@ -800,7 +803,7 @@ impl BigtableDb { retryable_error(metrics.clone()), ) .await - .map_err(|e| DbError::General(format!("BigTable connectivity error: {:?}", e)))?; + .map_err(error::BigTableError::Read)?; debug!("🉑 health check"); Ok(true) @@ -1318,11 +1321,12 @@ impl DbClient for BigTableClientImpl { } async fn health_check(&self) -> DbResult { - self.pool + Ok(self + .pool .get() .await? .health_check(self.metrics.clone()) - .await + .await?) } /// Returns true, because there's only one table in BigTable. We divide things up diff --git a/autopush-common/src/db/bigtable/pool.rs b/autopush-common/src/db/bigtable/pool.rs index 96715a192..8aa993706 100644 --- a/autopush-common/src/db/bigtable/pool.rs +++ b/autopush-common/src/db/bigtable/pool.rs @@ -7,15 +7,13 @@ use std::{ use actix_web::rt; use async_trait::async_trait; use cadence::StatsdClient; -use deadpool::managed::{Manager, PoolConfig, QueueMode, Timeouts}; +use deadpool::managed::{Manager, PoolConfig, PoolError, QueueMode, RecycleError, Timeouts}; use grpcio::{Channel, ChannelBuilder, ChannelCredentials, EnvBuilder}; use crate::db::bigtable::{bigtable_client::BigtableDb, BigTableDbSettings, BigTableError}; use crate::db::error::{DbError, DbResult}; use crate::db::DbSettings; -use super::bigtable_client::error; - const MAX_MESSAGE_LEN: i32 = 1 << 28; // 268,435,456 bytes const DEFAULT_GRPC_PORT: u16 = 443; @@ -43,12 +41,12 @@ impl BigTablePool { /// Get a new managed object from the pool. pub async fn get( &self, - ) -> Result, error::BigTableError> { - let obj = self - .pool - .get() - .await - .map_err(|e| error::BigTableError::Pool(e.to_string()))?; + ) -> Result, BigTableError> { + let obj = self.pool.get().await.map_err(|e| match e { + PoolError::Timeout(tt) => BigTableError::PoolTimeout(tt), + PoolError::Backend(e) => e, + e => BigTableError::Pool(Box::new(e)), + })?; debug!("🉑 Got db from pool"); Ok(obj) } @@ -118,7 +116,7 @@ impl BigTablePool { .config(config) .runtime(deadpool::Runtime::Tokio1) .build() - .map_err(|e| DbError::BTError(BigTableError::Pool(e.to_string())))?; + .map_err(|e| DbError::BTError(BigTableError::Config(e.to_string())))?; Ok(Self { pool, @@ -159,7 +157,7 @@ impl BigtableClientManager { dsn: Option, connection: String, metrics: Arc, - ) -> Result { + ) -> Result { Ok(Self { settings: settings.clone(), dsn, @@ -179,12 +177,12 @@ impl fmt::Debug for BigtableClientManager { #[async_trait] impl Manager for BigtableClientManager { - type Error = DbError; + type Error = BigTableError; type Type = BigtableDb; /// Create a new Bigtable Client with it's own channel. /// `BigtableClient` is the most atomic we can go. - async fn create(&self) -> Result { + async fn create(&self) -> Result { debug!("🏊 Create a new pool entry."); let entry = BigtableDb::new( self.get_channel()?, @@ -204,36 +202,27 @@ impl Manager for BigtableClientManager { if let Some(timeout) = self.settings.database_pool_connection_ttl { if Instant::now() - metrics.created > timeout { debug!("🏊 Recycle requested (old)."); - return Err(DbError::BTError(BigTableError::Recycle).into()); + return Err(RecycleError::Message("Connection too old".to_owned())); } } if let Some(timeout) = self.settings.database_pool_max_idle { if let Some(recycled) = metrics.recycled { if Instant::now() - recycled > timeout { debug!("🏊 Recycle requested (idle)."); - return Err(DbError::BTError(BigTableError::Recycle).into()); + return Err(RecycleError::Message("Connection too idle".to_owned())); } } } - // Clippy 0.1.73 complains about the `.map_err` being hard to read. - // note, this changes to `blocks_in_conditions` for 1.76+ - #[allow(clippy::blocks_in_conditions)] if !client .health_check(self.metrics.clone()) .await - .map_err(|e| { - debug!("🏊 Recycle requested (health). {:?}", e); - DbError::BTError(BigTableError::Recycle) - })? + .inspect_err(|e| debug!("🏊 Recycle requested (health). {:?}", e))? { debug!("🏊 Health check failed"); - return Err(DbError::BTError(BigTableError::Recycle).into()); + return Err(RecycleError::Message("Health check failed".to_owned())); } - // Bigtable does not offer a simple health check. A read or write operation would - // need to be performed. - Ok(()) } } @@ -256,14 +245,9 @@ impl BigtableClientManager { { debug!("🉑 Using emulator"); } else { - chan = chan.set_credentials(ChannelCredentials::google_default_credentials().map_err( - |e| { - BigTableError::Admin( - "Could not set credentials".to_owned(), - Some(e.to_string()), - ) - }, - )?); + chan = chan.set_credentials( + ChannelCredentials::google_default_credentials().map_err(BigTableError::GRPC)?, + ); debug!("🉑 Using real"); } Ok(chan) diff --git a/autopush-common/src/db/error.rs b/autopush-common/src/db/error.rs index 5fdd32ef0..a14331c23 100644 --- a/autopush-common/src/db/error.rs +++ b/autopush-common/src/db/error.rs @@ -61,14 +61,14 @@ pub enum DbError { TableStatusUnknown, #[cfg(feature = "bigtable")] - #[error("BigTable error {0}")] + #[error("BigTable error: {0}")] BTError(#[from] BigTableError), /* #[error("Postgres error")] PgError(#[from] PgError), */ - #[error("Connection failure {0}")] + #[error("Connection failure: {0}")] ConnectionError(String), #[error("The conditional request failed")] @@ -77,7 +77,7 @@ pub enum DbError { #[error("Database integrity error: {}", _0)] Integrity(String, Option), - #[error("Unknown Database Error {0}")] + #[error("Unknown Database Error: {0}")] General(String), // Return a 503 error diff --git a/autopush-common/src/errors.rs b/autopush-common/src/errors.rs index 7060f0840..5424fbb60 100644 --- a/autopush-common/src/errors.rs +++ b/autopush-common/src/errors.rs @@ -108,8 +108,6 @@ pub enum ApcErrorKind { ParseUrlError(#[from] url::ParseError), #[error(transparent)] ConfigError(#[from] config::ConfigError), - #[error(transparent)] - DbError(#[from] crate::db::error::DbError), #[error("Error while validating token")] TokenHashValidation(#[source] openssl::error::ErrorStack), #[error("Error while creating secret")] @@ -163,10 +161,6 @@ impl ApcErrorKind { Self::PongTimeout | Self::ExcessivePing => false, // Non-actionable Endpoint errors Self::PayloadError(_) => false, - #[cfg(feature = "bigtable")] - Self::DbError(crate::db::error::DbError::BTError( - crate::db::bigtable::BigTableError::Recycle, - )) => false, _ => true, } } @@ -177,10 +171,6 @@ impl ApcErrorKind { Self::PongTimeout => "pong_timeout", Self::ExcessivePing => "excessive_ping", Self::PayloadError(_) => "payload", - #[cfg(feature = "bigtable")] - Self::DbError(crate::db::error::DbError::BTError( - crate::db::bigtable::BigTableError::Recycle, - )) => "bt_recycle", _ => return None, }; Some(label)