mozilla-services · pjenvey · Mar 12, 2021 · Feb 26, 2021 · Feb 26, 2021 · Mar 1, 2021
diff --git a/src/db/spanner/manager/session.rs b/src/db/spanner/manager/session.rs
@@ -7,10 +7,7 @@ use grpcio::{CallOption, ChannelBuilder, ChannelCredentials, Environment, Metada
 use std::sync::Arc;
 use std::time::SystemTime;
 
-use crate::{
-    db::error::{DbError, DbErrorKind},
-    server::metrics::Metrics,
-};
+use crate::{db::error::DbError, server::metrics::Metrics};
 
 const SPANNER_ADDRESS: &str = "spanner.googleapis.com:443";
 
@@ -74,44 +71,62 @@ pub async fn recycle_spanner_session(
     let now = SystemTime::now()
         .duration_since(SystemTime::UNIX_EPOCH)
         .unwrap_or_default()
-        .as_secs();
-    if let Some(max_life) = max_lifetime {
-        // get the current UTC seconds
-        if let Some(age) = conn.session.create_time.clone().into_option() {
-            let age = now - age.seconds as u64;
-            if age > max_life as u64 {
-                metrics.incr("db.connection.max_life");
-                dbg!("### aging out", conn.session.get_name());
-                return Err(DbErrorKind::Expired.into());
+        .as_secs() as i64;
+    let mut req = GetSessionRequest::new();
+    req.set_name(conn.session.get_name().to_owned());
+    /*
+    Connections can sometimes produce GOAWAY errors. GOAWAY is an HTTP/2 frame
+    that is (usually) sent before a given connection is shut down. It appears
+    that gRPC passes these up the chain as errors. The problem is that since the
+    connection is being closed, further retries will (probably?) also fail. The
+    best course of action is to spin up a new session.
+
+    In theory, UNAVAILABLE-GOAWAY messages are retryable. How we retry them,
+    however, is not so clear. There are a few places in spanner functions where
+    we could possibly do this, but they get complicated quickly. (e.g. pass a
+    `&mut SpannerDb` to `db.execute_async`, but that gets REALLY messy, REALLY
+    fast.)
+
+    For now, we try a slightly different tactic here. Connections can age out
+    both from overall age and from lack of use. We can try to pre-emptively
+    kill off connections before we get the GOAWAY messages. Any additional
+    GOAWAY messages would be returned to the client as a 500, which will
+    result in the client retrying.
+
+     */
+    match conn.client.get_session_async(&req)?.await {
+        Ok(session) => {
+            if let Some(max_life) = max_lifetime {
+                let create_time = session.get_create_time().seconds;
+                let age = now - create_time;
+                if age > max_life as i64 {
+                    metrics.incr("db.connection.max_life");
+                    dbg!("### aging out", conn.session.get_name());
+                    conn.session = create_session(&conn.client, database_name).await?;
+                }
             }
-        }
-    }
-    // check how long that this has been idle...
-    if let Some(max_idle) = max_idle {
-        if let Some(idle) = conn.session.approximate_last_use_time.clone().into_option() {
-            // get current UTC seconds
-            let idle = std::cmp::max(0, now as i64 - idle.seconds);
-            if idle > max_idle as i64 {
-                metrics.incr("db.connection.max_idle");
-                dbg!("### idling out", conn.session.get_name());
-                return Err(DbErrorKind::Expired.into());
+            // check how long this session has been idle...
+            if let Some(max_idle) = max_idle {
+                let last_use = session.get_approximate_last_use_time().seconds;
+                let idle = std::cmp::max(0, now - last_use);
+                if idle > max_idle as i64 {
+                    metrics.incr("db.connection.max_idle");
+                    dbg!("### idling out", session.get_name());
+                    conn.session = create_session(&conn.client, database_name).await?;
+                }
             }
+            Ok(())
         }
-    }
-
-    let mut req = GetSessionRequest::new();
-    req.set_name(conn.session.get_name().to_owned());
-    if let Err(e) = conn.client.get_session_async(&req)?.await {
-        match e {
+        Err(e) => match e {
             grpcio::Error::RpcFailure(ref status)
                 if status.status == grpcio::RpcStatusCode::NOT_FOUND =>
             {
                 conn.session = create_session(&conn.client, database_name).await?;
+                Ok(())
             }
             _ => return Err(e.into()),
-        }
+        },
     }
-    Ok(())
 }
 
 async fn create_session(