From 5dc5ee737550f461b25c28f6a5cb79c901e6a3c5 Mon Sep 17 00:00:00 2001 From: rahulb-yb Date: Mon, 1 Jul 2024 12:28:25 +0000 Subject: [PATCH] [#22849] YSQL: Correctly handle reset phase timeout errors in YSQL Connection Manager Summary: **Problem Statement:** While a logical client detaches from its corresponding server connection, Connection Manager fires certain RESET queries before allowing the server connection to return back to the pool of usable connections for other logical clients to use. The upstream code has made it such that if these RESET queries take longer than 1 second to execute and return cumulatively, then odyssey breaks the connection and forwards its own error to the client with the message `Resource temporarily unavailable`. It is to be noted that there were no errors while executing the client's workload, and that this is an error formed by the Connection Manager layer itself. **Solution/Changes Made:** Since there were technically no errors due to the client's workload, it makes no sense to forward any error to the logical client - we can instead silently close the server connection when it is found that we hit the reset timeout during the reset phase. Additionally, increase the reset timeout in this diff as the client's workload is already pipelined and is not dependent on the server's reset phase completing. It will help save some cycles internally if we can re-use the same server connection. We modify a function, `od_backend_query` to return a different status when the reset timeout is hit, which can be used for any "reset" query that is fired at the time of detaching the logical and physical connection. Some examples of this can be seen with the help of a small excerpt from the conn mgr template file, picked up from pool config settings: ``` pool_discard no pool_rollback yes ``` Each of these values correspond to what should be done to the server connection with respect to "resetting" the context on it as per the user's wish - `discard` fires a "discard all" query - `rollback` fires a "rollback" if it is found that the server is in an active transaction before firing reset statements - By default, we fire a query that contains "reset all" "reset role" and "reset session authorization" All of the above scenarios use the same function being modified to return a different status depending on the reset type Jira: DB-11746 Test Plan: Jenkins: enable connection manager, all tests Reviewers: skumar, asrinivasan Reviewed By: asrinivasan Subscribers: yql Differential Revision: https://phorge.dev.yugabyte.com/D35564 --- src/odyssey/sources/backend.c | 4 +++- src/odyssey/sources/reset.c | 14 +++++++++----- src/odyssey/sources/router.c | 5 ++++- src/odyssey/sources/server.h | 3 +++ 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/odyssey/sources/backend.c b/src/odyssey/sources/backend.c index b204f7856880..fac07f711f6a 100644 --- a/src/odyssey/sources/backend.c +++ b/src/odyssey/sources/backend.c @@ -19,6 +19,7 @@ void od_backend_close(od_server_t *server) assert(server->tls == NULL); server->is_transaction = 0; server->yb_sticky_connection = false; + server->reset_timeout = false; server->idle_time = 0; kiwi_key_init(&server->key); kiwi_key_init(&server->key_client); @@ -736,7 +737,8 @@ int od_backend_ready_wait(od_server_t *server, char *context, int count, "read error: %s", od_io_error(&server->io)); } - return -1; + /* return new status if timeout error */ + return -2; } kiwi_be_type_t type = *(char *)machine_msg_data(msg); od_debug(&instance->logger, context, server->client, server, diff --git a/src/odyssey/sources/reset.c b/src/odyssey/sources/reset.c index c783cd403822..b0e504570cfa 100644 --- a/src/odyssey/sources/reset.c +++ b/src/odyssey/sources/reset.c @@ -48,7 +48,7 @@ int od_reset(od_server_t *server) * * 3. Continue with (1) */ - int wait_timeout = 1000; + int wait_timeout = 5000; int wait_try = 0; int wait_try_cancel = 0; int wait_cancel_limit = 1; @@ -66,7 +66,8 @@ int od_reset(od_server_t *server) wait_try++; rc = od_backend_ready_wait(server, "reset", 1, wait_timeout); - if (rc == -1) + /* can be -1 or -2 */ + if (rc < 0) break; } if (rc == -1) { @@ -113,7 +114,7 @@ int od_reset(od_server_t *server) query_rlb, NULL, sizeof(query_rlb), wait_timeout, 1); - if (rc == -1) + if (rc < 0) goto error; assert(!server->is_transaction); } @@ -125,7 +126,7 @@ int od_reset(od_server_t *server) rc = od_backend_query(server, "reset-discard", query_discard, NULL, sizeof(query_discard), wait_timeout, 1); - if (rc == NOT_OK_RESPONSE) + if (rc < 0) goto error; } @@ -136,7 +137,7 @@ int od_reset(od_server_t *server) rc = od_backend_query(server, "reset-discard-smart", query_discard, NULL, sizeof(query_discard), wait_timeout, 1); - if (rc == NOT_OK_RESPONSE) + if (rc < 0) goto error; } @@ -147,6 +148,9 @@ int od_reset(od_server_t *server) NULL, sizeof(query_reset), wait_timeout, 1); if (rc == -1) goto error; + /* reset timeout */ + if (rc == -2) + server->reset_timeout = true; } /* ready */ diff --git a/src/odyssey/sources/router.c b/src/odyssey/sources/router.c index 485615ba930d..5855815c4dd1 100644 --- a/src/odyssey/sources/router.c +++ b/src/odyssey/sources/router.c @@ -725,8 +725,11 @@ void od_router_detach(od_router_t *router, od_client_t *client) * a. Creating TEMP TABLES. * b. Use of WITH HOLD CURSORS. * c. Client connection is a logical or physical replication connection + * d. It took too long to reset state on the server. */ - if (od_likely(!server->offline) && !server->yb_sticky_connection) { + if (od_likely(!server->offline) && + !server->yb_sticky_connection && + !server->reset_timeout) { od_instance_t *instance = server->global->instance; if (route->id.physical_rep || route->id.logical_rep) { od_debug(&instance->logger, "expire-replication", NULL, diff --git a/src/odyssey/sources/server.h b/src/odyssey/sources/server.h index f65123d6b6a4..1580720af92a 100644 --- a/src/odyssey/sources/server.h +++ b/src/odyssey/sources/server.h @@ -58,7 +58,9 @@ struct od_server { od_list_t link; + /* YB */ bool yb_sticky_connection; + bool reset_timeout; }; static const size_t OD_SERVER_DEFAULT_HASHMAP_SZ = 420; @@ -84,6 +86,7 @@ static inline void od_server_init(od_server_t *server, int reserve_prep_stmts) server->endpoint_selector = 0; od_stat_state_init(&server->stats_state); server->yb_sticky_connection = false; + server->reset_timeout = false; #ifdef USE_SCRAM od_scram_state_init(&server->scram_state);