Skip to content

Commit

Permalink
feat: Add metrics to try and analyze UpdateItem bug. (#344)
Browse files Browse the repository at this point in the history
* feat: Add metrics to try and analyze UpdateItem bug.

Includes cargo fix for 1.67 clippy strings.
see relevant changes in

autoendpoint::routers::webPush::WebPushRouter::route_notification

* Eat the UpdateItem error

Issue: SYNC-3574 SYNC-3401
  • Loading branch information
jrconlin authored Feb 3, 2023
1 parent 008e3e8 commit 7641d18
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 20 deletions.
27 changes: 25 additions & 2 deletions autoendpoint/src/db/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -356,12 +356,35 @@ impl DbClient for DbClientImpl {
..Default::default()
};

retry_policy()
if let Err(err) = retry_policy()
.retry_if(
|| self.ddb.update_item(input.clone()),
retryable_updateitem_error(self.metrics.clone()),
)
.await?;
.await
{
/*
UpdateItem errors frequently occur during a redeploy due to a variety of reasons
(e.g. since a redeploy forces UAs to disconnect and changes the router host name,
the associated node_id will be different for legitimate reasons.) We should eat
these errors because they're not actionable and part of the normal run process.
*/

if let rusoto_core::RusotoError::Service(
rusoto_dynamodb::UpdateItemError::ConditionalCheckFailed(_),
) = err
{
// Most likely, the node_id recorded in the db does not match the
// node_id contained in the notification
self.metrics
.incr_with_tags("error.node.update")
.with_tag("node_id", &node_id)
.try_send()
.ok();
return Ok(());
}
return Err(DbError::from(err));
};

Ok(())
}
Expand Down
17 changes: 8 additions & 9 deletions autoendpoint/src/routers/webpush.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::db::client::DbClient;
use crate::error::{ApiError, ApiErrorKind, ApiResult};
use crate::error::{ApiErrorKind, ApiResult};
use crate::extractors::notification::Notification;
use crate::extractors::router_data_input::RouterDataInput;
use crate::routers::{Router, RouterError, RouterResponse};
Expand Down Expand Up @@ -38,6 +38,7 @@ impl Router for WebPushRouter {
}

async fn route_notification(&self, notification: &Notification) -> ApiResult<RouterResponse> {
// The notification contains the original subscription information
let user = &notification.subscription.user;
debug!(
"Routing WebPush notification to UAID {}",
Expand Down Expand Up @@ -65,9 +66,8 @@ impl Router for WebPushRouter {
);
}
Err(error) => {
// We should stop sending notifications to this node for this user
debug!("Error while sending webpush notification: {}", error);
self.remove_node_id(user, node_id.clone()).await?;
self.remove_node_id(user, node_id).await?
}
}
}
Expand Down Expand Up @@ -142,7 +142,7 @@ impl Router for WebPushRouter {
Err(error) => {
// Can't communicate with the node, so we should stop using it
debug!("Error while triggering notification check: {}", error);
self.remove_node_id(&user, node_id.clone()).await?;
self.remove_node_id(&user, node_id).await?;
Ok(self.make_stored_response(notification))
}
}
Expand Down Expand Up @@ -186,13 +186,12 @@ impl WebPushRouter {

/// Remove the node ID from a user. This is done if the user is no longer
/// connected to the node.
async fn remove_node_id(&self, user: &DynamoDbUser, node_id: String) -> ApiResult<()> {
async fn remove_node_id(&self, user: &DynamoDbUser, node_id: &str) -> ApiResult<()> {
self.metrics.incr("updates.client.host_gone").ok();

self.ddb
.remove_node_id(user.uaid, node_id, user.connected_at)
.await
.map_err(ApiError::from)
.remove_node_id(user.uaid, node_id.to_owned(), user.connected_at)
.await?;
Ok(())
}

/// Update metrics and create a response for when a notification has been directly forwarded to
Expand Down
17 changes: 8 additions & 9 deletions autopush/src/megaphone.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ use autopush_common::errors::{ApcErrorKind, Result};
use crate::server::protocol::BroadcastValue;

// A Broadcast entry Key in a BroadcastRegistry
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Hash)]
struct BroadcastKey(u32);
type BroadcastKey = u32;

// Broadcasts a client is subscribed to and the last change seen
#[derive(Debug, Default)]
Expand All @@ -20,7 +19,7 @@ pub struct BroadcastSubs {

#[derive(Debug)]
struct BroadcastRegistry {
lookup: HashMap<String, u32>,
lookup: HashMap<String, BroadcastKey>,
table: Vec<String>,
}

Expand All @@ -41,20 +40,20 @@ impl BroadcastRegistry {
// exists
fn add_broadcast(&mut self, broadcast_id: String) -> BroadcastKey {
if let Some(v) = self.lookup.get(&broadcast_id) {
return BroadcastKey(*v);
return *v;
}
let i = self.table.len();
let i = self.table.len() as BroadcastKey;
self.table.push(broadcast_id.clone());
self.lookup.insert(broadcast_id, i as u32);
BroadcastKey(i as u32)
self.lookup.insert(broadcast_id, i);
i
}

fn lookup_id(&self, key: BroadcastKey) -> Option<String> {
self.table.get(key.0 as usize).cloned()
self.table.get(key as usize).cloned()
}

fn lookup_key(&self, broadcast_id: &str) -> Option<BroadcastKey> {
self.lookup.get(broadcast_id).cloned().map(BroadcastKey)
self.lookup.get(broadcast_id).copied()
}
}

Expand Down
4 changes: 4 additions & 0 deletions autopush/src/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ impl AutopushServer {
..Default::default()
},
));
/*
Sentry 0.29+ automatically enables `PanicIntegration`.
see https://docs.rs/sentry-panic/latest/sentry_panic/
*/
Some(guard)
} else {
None
Expand Down

0 comments on commit 7641d18

Please sign in to comment.