From 091dddb9b2baf1f4b481e8d7961d451b71a8508b Mon Sep 17 00:00:00 2001 From: Gaole Meng Date: Tue, 31 Jan 2023 13:34:44 -0800 Subject: [PATCH] fix: remove unrecoverable connection from connection pool during multiplexing (#1967) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Split writer into connection worker and wrapper, this is a prerequisite for multiplexing client * feat: add connection worker pool skeleton, used for multiplexing client * feat: add Load api for connection worker for multiplexing client * feat: add multiplexing support to connection worker. We will treat every new stream name as a switch of destinationt * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: port the multiplexing client core algorithm and basic tests also fixed a tiny bug inside fake bigquery write impl for getting thre response from offset * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: wire multiplexing connection pool to stream writer * feat: some fixes for multiplexing client * feat: fix some todos, and reject the mixed behavior of passed in client or not * feat: fix the bug that we may peek into the write_stream field but it's possible the proto schema does not contain this field * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: fix the bug that we may peek into the write_stream field but it's possible the proto schema does not contain this field * feat: add getInflightWaitSeconds implementation * feat: Add schema comparision in connection loop to ensure schema update for the same stream name can be notified * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * feat: add schema update support to multiplexing * fix: fix windows build bug: windows Instant resolution is different with linux * fix: fix another failing tests for windows build * fix: fix another test failure for Windows build * feat: Change new thread for each retry to be a thread pool to avoid create/tear down too much threads if lots of retries happens * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix: add back the background executor provider that's accidentally removed * feat: throw error when use connection pool for explicit stream * fix: Add precision truncation to the passed in value from JSON float and double type. * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * modify the bom version * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix deadlockissue in ConnectionWorkerPool * fix: fix deadlock issue during close + append for multiplexing * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix: fix one potential root cause of deadlock issue for non-multiplexing case * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Add timeout to inflight queue waiting, and also add some extra log * feat: allow java client lib handle switch table schema for the same stream name * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- .../bigquery/storage/v1/ConnectionWorker.java | 5 + .../storage/v1/ConnectionWorkerPool.java | 23 +++- .../bigquery/storage/v1/StreamWriter.java | 10 ++ .../bigquery/storage/v1/StreamWriterTest.java | 110 ++++++++++++++++++ 4 files changed, 146 insertions(+), 2 deletions(-) diff --git a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorker.java b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorker.java index b3b2c19199..0060ad7314 100644 --- a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorker.java +++ b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorker.java @@ -378,6 +378,11 @@ public String getWriterId() { return writerId; } + boolean isConnectionInUnrecoverableState() { + // If final status is set, there's no + return connectionFinalStatus != null; + } + /** Close the stream writer. Shut down all resources. */ @Override public void close() { diff --git a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorkerPool.java b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorkerPool.java index 0e6b5eab3a..c4e68bb189 100644 --- a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorkerPool.java +++ b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/ConnectionWorkerPool.java @@ -234,9 +234,17 @@ ApiFuture append(StreamWriter streamWriter, ProtoRows rows, streamWriter, (key, existingStream) -> { // Stick to the existing stream if it's not overwhelmed. - if (existingStream != null && !existingStream.getLoad().isOverwhelmed()) { + if (existingStream != null + && !existingStream.getLoad().isOverwhelmed() + && !existingStream.isConnectionInUnrecoverableState()) { return existingStream; } + if (existingStream != null && existingStream.isConnectionInUnrecoverableState()) { + existingStream = null; + } + // Before search for the next connection to attach, clear the finalized connections + // first so that they will not be selected. + clearFinalizedConnectionWorker(); // Try to create or find another existing stream to reuse. ConnectionWorker createdOrExistingConnection = null; try { @@ -299,7 +307,6 @@ private ConnectionWorker createOrReuseConnectionWorker( } return createConnectionWorker(streamWriter.getStreamName(), streamWriter.getProtoSchema()); } else { - // Stick to the original connection if all the connections are overwhelmed. if (existingConnectionWorker != null) { return existingConnectionWorker; @@ -310,6 +317,18 @@ private ConnectionWorker createOrReuseConnectionWorker( } } + private void clearFinalizedConnectionWorker() { + Set connectionWorkerSet = new HashSet<>(); + for (ConnectionWorker existingWorker : connectionWorkerPool) { + if (existingWorker.isConnectionInUnrecoverableState()) { + connectionWorkerSet.add(existingWorker); + } + } + for (ConnectionWorker workerToRemove : connectionWorkerSet) { + connectionWorkerPool.remove(workerToRemove); + } + } + /** Select out the best connection worker among the given connection workers. */ static ConnectionWorker pickBestLoadConnection( Comparator comparator, List connectionWorkerList) { diff --git a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/StreamWriter.java b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/StreamWriter.java index edc7240ad7..337ff86a66 100644 --- a/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/StreamWriter.java +++ b/google-cloud-bigquerystorage/src/main/java/com/google/cloud/bigquery/storage/v1/StreamWriter.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.time.Duration; import java.util.Map; +import java.util.Map.Entry; import java.util.Objects; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; @@ -450,6 +451,15 @@ static void cleanUp() { connectionPoolMap.clear(); } + @VisibleForTesting + ConnectionWorkerPool getTestOnlyConnectionWorkerPool() { + ConnectionWorkerPool connectionWorkerPool = null; + for (Entry entry : connectionPoolMap.entrySet()) { + connectionWorkerPool = entry.getValue(); + } + return connectionWorkerPool; + } + /** A builder of {@link StreamWriter}s. */ public static final class Builder { private static final long DEFAULT_MAX_INFLIGHT_REQUESTS = 1000L; diff --git a/google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/StreamWriterTest.java b/google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/StreamWriterTest.java index eacfdcb40f..6426bc6ca1 100644 --- a/google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/StreamWriterTest.java +++ b/google-cloud-bigquerystorage/src/test/java/com/google/cloud/bigquery/storage/v1/StreamWriterTest.java @@ -69,6 +69,7 @@ public class StreamWriterTest { private static final Logger log = Logger.getLogger(StreamWriterTest.class.getName()); private static final String TEST_STREAM_1 = "projects/p/datasets/d1/tables/t1/streams/_default"; private static final String TEST_STREAM_2 = "projects/p/datasets/d2/tables/t2/streams/_default"; + private static final String TEST_STREAM_3 = "projects/p/datasets/d3/tables/t3/streams/_default"; private static final String TEST_STREAM_SHORTEN = "projects/p/datasets/d2/tables/t2/_default"; private static final String EXPLICIT_STEAM = "projects/p/datasets/d1/tables/t1/streams/s1"; private static final String TEST_TRACE_ID = "DATAFLOW:job_id"; @@ -1090,6 +1091,115 @@ public void testExtractDatasetName() throws Exception { Assert.assertTrue(ex.getMessage().contains("The passed in stream name does not match")); } + @Test + public void testRetryInUnrecoverableStatus_MultiplexingCase() throws Exception { + ConnectionWorkerPool.setOptions( + Settings.builder().setMinConnectionsPerRegion(1).setMaxConnectionsPerRegion(4).build()); + ConnectionWorkerPool.enableTestingLogic(); + + // Setup: create three stream writers, two of them are writing to the same stream. + // Those four stream writers should be assigned to the same connection. + // 1. Submit three requests at first to trigger connection retry limitation. + // 2. At this point the connection should be entering a unrecoverable state. + // 3. Further submit requests to those stream writers would trigger connection reassignment. + StreamWriter writer1 = getMultiplexingStreamWriter(TEST_STREAM_1); + StreamWriter writer2 = getMultiplexingStreamWriter(TEST_STREAM_2); + StreamWriter writer3 = getMultiplexingStreamWriter(TEST_STREAM_3); + StreamWriter writer4 = getMultiplexingStreamWriter(TEST_STREAM_3); + + testBigQueryWrite.setCloseForeverAfter(2); + testBigQueryWrite.setTimesToClose(1); + testBigQueryWrite.addResponse(createAppendResponse(0)); + testBigQueryWrite.addResponse(createAppendResponse(1)); + + // Connection will be failed after triggering the third append. + ApiFuture appendFuture1 = sendTestMessage(writer1, new String[] {"A"}, 0); + ApiFuture appendFuture2 = sendTestMessage(writer2, new String[] {"B"}, 1); + ApiFuture appendFuture3 = sendTestMessage(writer3, new String[] {"C"}, 2); + TimeUnit.SECONDS.sleep(1); + assertEquals(0, appendFuture1.get().getAppendResult().getOffset().getValue()); + assertEquals(1, appendFuture2.get().getAppendResult().getOffset().getValue()); + assertThrows( + ExecutionException.class, + () -> { + assertEquals(2, appendFuture3.get().getAppendResult().getOffset().getValue()); + }); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getTotalConnectionCount(), 1); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getCreateConnectionCount(), 1); + + // Insert another request to the writer attached to closed connection would create another + // connection. + + testBigQueryWrite.setCloseForeverAfter(0); + testBigQueryWrite.addResponse(createAppendResponse(4)); + testBigQueryWrite.addResponse(createAppendResponse(5)); + testBigQueryWrite.addResponse(createAppendResponse(6)); + ApiFuture appendFuture4 = sendTestMessage(writer4, new String[] {"A"}, 2); + ApiFuture appendFuture5 = sendTestMessage(writer1, new String[] {"A"}, 3); + ApiFuture appendFuture6 = sendTestMessage(writer2, new String[] {"B"}, 4); + assertEquals(4, appendFuture4.get().getAppendResult().getOffset().getValue()); + assertEquals(5, appendFuture5.get().getAppendResult().getOffset().getValue()); + assertEquals(6, appendFuture6.get().getAppendResult().getOffset().getValue()); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getTotalConnectionCount(), 1); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getCreateConnectionCount(), 2); + + writer1.close(); + writer2.close(); + writer3.close(); + writer4.close(); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getTotalConnectionCount(), 0); + } + + @Test + public void testCloseWhileInUnrecoverableState() throws Exception { + ConnectionWorkerPool.setOptions( + Settings.builder().setMinConnectionsPerRegion(1).setMaxConnectionsPerRegion(4).build()); + ConnectionWorkerPool.enableTestingLogic(); + + // Setup: create three stream writers + // 1. Submit three requests at first to trigger connection retry limitation. + // 2. Submit request to writer3 to trigger reassignment + // 3. Close the previous two writers would be succesful + StreamWriter writer1 = getMultiplexingStreamWriter(TEST_STREAM_1); + StreamWriter writer2 = getMultiplexingStreamWriter(TEST_STREAM_2); + StreamWriter writer3 = getMultiplexingStreamWriter(TEST_STREAM_3); + + testBigQueryWrite.setCloseForeverAfter(2); + testBigQueryWrite.setTimesToClose(1); + testBigQueryWrite.addResponse(createAppendResponse(0)); + testBigQueryWrite.addResponse(createAppendResponse(1)); + + // Connection will be failed after triggering the third append. + ApiFuture appendFuture1 = sendTestMessage(writer1, new String[] {"A"}, 0); + ApiFuture appendFuture2 = sendTestMessage(writer2, new String[] {"B"}, 1); + ApiFuture appendFuture3 = sendTestMessage(writer3, new String[] {"C"}, 2); + TimeUnit.SECONDS.sleep(1); + assertEquals(0, appendFuture1.get().getAppendResult().getOffset().getValue()); + assertEquals(1, appendFuture2.get().getAppendResult().getOffset().getValue()); + assertThrows( + ExecutionException.class, + () -> { + assertEquals(2, appendFuture3.get().getAppendResult().getOffset().getValue()); + }); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getTotalConnectionCount(), 1); + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getCreateConnectionCount(), 1); + + writer1.close(); + writer2.close(); + // We will still be left with one request + assertEquals(writer1.getTestOnlyConnectionWorkerPool().getCreateConnectionCount(), 1); + } + + public StreamWriter getMultiplexingStreamWriter(String streamName) throws IOException { + return StreamWriter.newBuilder(streamName, client) + .setWriterSchema(createProtoSchema()) + .setEnableConnectionPool(true) + .setMaxInflightRequests(10) + .setLocation("US") + .setMaxRetryDuration(java.time.Duration.ofMillis(100)) + .build(); + } + // Timeout to ensure close() doesn't wait for done callback timeout. @Test(timeout = 10000) public void testCloseDisconnectedStream() throws Exception {