From 2861132922dd9231ebef2431b35871ba1a9ecd66 Mon Sep 17 00:00:00 2001 From: Mark Drobnak Date: Mon, 13 Jul 2020 11:07:53 -0400 Subject: [PATCH] Check database health in `/health` Also changed the `/__heartbeat__` handler from `/status` to `/health`. --- autoendpoint/src/db/client.rs | 46 +++++++++++++++++++++++++++---- autoendpoint/src/db/error.rs | 10 ++++++- autoendpoint/src/db/retry.rs | 20 +++++++++++++- autoendpoint/src/routes/health.rs | 36 ++++++++++++++++++++---- autoendpoint/src/server.rs | 2 +- 5 files changed, 100 insertions(+), 14 deletions(-) diff --git a/autoendpoint/src/db/client.rs b/autoendpoint/src/db/client.rs index 4222d834d..9b3cec50e 100644 --- a/autoendpoint/src/db/client.rs +++ b/autoendpoint/src/db/client.rs @@ -1,17 +1,17 @@ use crate::db::error::{DbError, DbResult}; use crate::db::retry::{ - retry_policy, retryable_delete_error, retryable_getitem_error, retryable_putitem_error, - retryable_updateitem_error, + retry_policy, retryable_delete_error, retryable_describe_table_error, retryable_getitem_error, + retryable_putitem_error, retryable_updateitem_error, }; use autopush_common::db::{DynamoDbNotification, DynamoDbUser}; use autopush_common::notification::Notification; use autopush_common::{ddb_item, hashmap, val}; use cadence::StatsdClient; use rusoto_core::credential::StaticProvider; -use rusoto_core::{HttpClient, Region}; +use rusoto_core::{HttpClient, Region, RusotoError}; use rusoto_dynamodb::{ - AttributeValue, DeleteItemInput, DynamoDb, DynamoDbClient, GetItemInput, PutItemInput, - UpdateItemInput, + AttributeValue, DeleteItemInput, DescribeTableError, DescribeTableInput, DynamoDb, + DynamoDbClient, GetItemInput, PutItemInput, UpdateItemInput, }; use std::collections::HashSet; use std::env; @@ -180,4 +180,40 @@ impl DbClient { Ok(()) } + + /// Check if the router table exists + pub async fn router_table_exists(&self) -> DbResult { + self.table_exists(self.router_table.clone()).await + } + + /// Check if the message table exists + pub async fn message_table_exists(&self) -> DbResult { + self.table_exists(self.message_table.clone()).await + } + + /// Check if a table exists + async fn table_exists(&self, table_name: String) -> DbResult { + let describe_item = DescribeTableInput { table_name }; + + let output = match retry_policy() + .retry_if( + || self.ddb.describe_table(describe_item.clone()), + retryable_describe_table_error(self.metrics.clone()), + ) + .await + { + Ok(output) => output, + Err(RusotoError::Service(DescribeTableError::ResourceNotFound(_))) => { + return Ok(false); + } + Err(e) => return Err(e.into()), + }; + + let status = output + .table + .and_then(|table| table.table_status) + .ok_or(DbError::TableStatusUnknown)?; + + Ok(["CREATING", "UPDATING", "ACTIVE"].contains(&status.as_str())) + } } diff --git a/autoendpoint/src/db/error.rs b/autoendpoint/src/db/error.rs index ddbfc5898..8ebf7848d 100644 --- a/autoendpoint/src/db/error.rs +++ b/autoendpoint/src/db/error.rs @@ -1,7 +1,9 @@ use thiserror::Error; use rusoto_core::RusotoError; -use rusoto_dynamodb::{DeleteItemError, GetItemError, PutItemError, UpdateItemError}; +use rusoto_dynamodb::{ + DeleteItemError, DescribeTableError, GetItemError, PutItemError, UpdateItemError, +}; pub type DbResult = Result; @@ -19,6 +21,12 @@ pub enum DbError { #[error("Database error while performing DeleteItem")] DeleteItem(#[from] RusotoError), + #[error("Database error while performing DescribeTable")] + DescribeTable(#[from] RusotoError), + #[error("Error while performing (de)serialization: {0}")] Serialization(#[from] serde_dynamodb::Error), + + #[error("Unable to determine table status")] + TableStatusUnknown, } diff --git a/autoendpoint/src/db/retry.rs b/autoendpoint/src/db/retry.rs index 974d755a7..3c7ec9bcc 100644 --- a/autoendpoint/src/db/retry.rs +++ b/autoendpoint/src/db/retry.rs @@ -1,7 +1,9 @@ use again::RetryPolicy; use cadence::{Counted, StatsdClient}; use rusoto_core::RusotoError; -use rusoto_dynamodb::{DeleteItemError, GetItemError, PutItemError, UpdateItemError}; +use rusoto_dynamodb::{ + DeleteItemError, DescribeTableError, GetItemError, PutItemError, UpdateItemError, +}; use std::time::Duration; /// Create a retry function for the given error @@ -28,6 +30,22 @@ retryable_error!(retryable_updateitem_error, UpdateItemError, "update_item"); retryable_error!(retryable_putitem_error, PutItemError, "put_item"); retryable_error!(retryable_delete_error, DeleteItemError, "delete_item"); +// DescribeTableError does not have a ProvisionedThroughputExceeded variant +pub fn retryable_describe_table_error( + metrics: StatsdClient, +) -> impl Fn(&RusotoError) -> bool { + move |err| match err { + RusotoError::Service(DescribeTableError::InternalServerError(_)) => { + metrics + .incr_with_tags("database.retry") + .with_tag("error", "describe_table_error") + .send(); + true + } + _ => false, + } +} + /// Build an exponential retry policy pub fn retry_policy() -> RetryPolicy { RetryPolicy::exponential(Duration::from_millis(100)) diff --git a/autoendpoint/src/routes/health.rs b/autoendpoint/src/routes/health.rs index 47c8447cf..830704f8d 100644 --- a/autoendpoint/src/routes/health.rs +++ b/autoendpoint/src/routes/health.rs @@ -1,18 +1,42 @@ //! Health and Dockerflow routes -use actix_web::web::Json; +use crate::db::error::DbResult; +use crate::server::ServerState; +use actix_web::web::{Data, Json}; use actix_web::HttpResponse; use serde_json::json; -/// Handle the `/health` route -pub async fn health_route() -> Json { - // TODO: Get database table health +/// Handle the `/health` and `/__heartbeat__` routes +pub async fn health_route(state: Data) -> Json { + let router_health = interpret_table_health(state.ddb.router_table_exists().await); + let message_health = interpret_table_health(state.ddb.message_table_exists().await); + Json(json!({ - "status": "OK" + "status": "OK", + "version": env!("CARGO_PKG_VERSION"), + "router_table": router_health, + "message_table": message_health })) } -/// Handle the `/status` and `/__heartbeat__` routes +/// Convert the result of a DB health check to JSON +fn interpret_table_health(health: DbResult) -> serde_json::Value { + match health { + Ok(true) => json!({ + "status": "OK" + }), + Ok(false) => json!({ + "status": "NOT OK", + "cause": "Nonexistent table" + }), + Err(e) => json!({ + "status": "NOT OK", + "cause": e.to_string() + }), + } +} + +/// Handle the `/status` route pub async fn status_route() -> Json { Json(json!({ "status": "OK", diff --git a/autoendpoint/src/server.rs b/autoendpoint/src/server.rs index a23fb6a7e..a5f30d088 100644 --- a/autoendpoint/src/server.rs +++ b/autoendpoint/src/server.rs @@ -71,7 +71,7 @@ impl Server { .service(web::resource("/status").route(web::get().to(status_route))) .service(web::resource("/health").route(web::get().to(health_route))) // Dockerflow - .service(web::resource("/__heartbeat__").route(web::get().to(status_route))) + .service(web::resource("/__heartbeat__").route(web::get().to(health_route))) .service(web::resource("/__lbheartbeat__").route(web::get().to(lb_heartbeat_route))) .service(web::resource("/__version__").route(web::get().to(version_route))) })