From 354878232160e9d8b4a7909fd1f2b40ff9749163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Jer=C5=A1e?= Date: Mon, 9 Dec 2024 14:03:17 +0100 Subject: [PATCH 1/4] Catch EndpointConnectionError in retry decorator --- docs/CHANGELOG.rst | 9 +++++++++ resolwe/storage/connectors/transfer.py | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 991d18149..f4a9313b3 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -6,6 +6,15 @@ All notable changes to this project are documented in this file. This project adheres to `Semantic Versioning `_. +========== +Unreleased +========== + +Changed +------- +- Catch ``EndpointConnectionError`` in retry decorator + + =================== 42.0.3 - 2024-12-09 =================== diff --git a/resolwe/storage/connectors/transfer.py b/resolwe/storage/connectors/transfer.py index 986bd95bd..45499cb00 100644 --- a/resolwe/storage/connectors/transfer.py +++ b/resolwe/storage/connectors/transfer.py @@ -30,9 +30,9 @@ gcs_exceptions = [] try: - from botocore.exceptions import ClientError + from botocore.exceptions import ClientError, EndpointConnectionError - boto_exceptions = [ClientError] + boto_exceptions = [ClientError, EndpointConnectionError] except ModuleNotFoundError: boto_exceptions = [] From 22ad0fc9d6074175a106919736b056a44cd5bc24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Jer=C5=A1e?= Date: Mon, 9 Dec 2024 14:14:34 +0100 Subject: [PATCH 2/4] Implement exponential backoff in retry decorator --- docs/CHANGELOG.rst | 1 + resolwe/storage/connectors/transfer.py | 12 ++++++++---- resolwe/storage/tests/test_transfer.py | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index f4a9313b3..783216d11 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -13,6 +13,7 @@ Unreleased Changed ------- - Catch ``EndpointConnectionError`` in retry decorator +- Implement exponential backoff in retry 
decorator =================== diff --git a/resolwe/storage/connectors/transfer.py b/resolwe/storage/connectors/transfer.py index 45499cb00..c33ae3ad4 100644 --- a/resolwe/storage/connectors/transfer.py +++ b/resolwe/storage/connectors/transfer.py @@ -39,7 +39,8 @@ logger = logging.getLogger(__name__) ERROR_MAX_RETRIES = 3 -ERROR_TIMEOUT = 5 # In seconds. +ERROR_INITIAL_TIMEOUT = 1 # In seconds. +ERROR_MAX_TIMEOUT = 60 # In seconds. transfer_exceptions = tuple( boto_exceptions + gcs_exceptions @@ -56,14 +57,17 @@ def retry_on_transfer_error(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) except transfer_exceptions: # Log the exception on retry for inspection. - if retry != ERROR_MAX_RETRIES: + if retry < ERROR_MAX_RETRIES: + timeout = min( + ERROR_MAX_TIMEOUT, ERROR_INITIAL_TIMEOUT * (2 ** (retry - 1)) + ) logger.exception( "Retry %d/%d got exception, will retry in %d seconds.", retry, ERROR_MAX_RETRIES, - ERROR_TIMEOUT, + timeout, ) - sleep(ERROR_TIMEOUT) + sleep(timeout) # Raise exception when max retries are exceeded. 
else: logger.exception("Final retry got exception, re-raising it.") diff --git a/resolwe/storage/tests/test_transfer.py b/resolwe/storage/tests/test_transfer.py index 6038ed93a..720825e26 100644 --- a/resolwe/storage/tests/test_transfer.py +++ b/resolwe/storage/tests/test_transfer.py @@ -41,7 +41,8 @@ def test_exception(self): with self.assertRaises(DataTransferError): t.transfer_objects("test_url", [{}, {}]) - @patch("resolwe.storage.connectors.transfer.ERROR_TIMEOUT", 0.1) + @patch("resolwe.storage.connectors.transfer.ERROR_INITIAL_TIMEOUT", 0.1) + @patch("resolwe.storage.connectors.transfer.ERROR_MAX_TIMEOUT", 0.1) @patch("resolwe.storage.connectors.transfer.ERROR_MAX_RETRIES", 3) def test_retry_transfer(self): t = Transfer(self.local, self.local) From f21e7d294adab2dc4b98852e1d51b8bd6381a9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Jer=C5=A1e?= Date: Wed, 4 Dec 2024 14:35:28 +0100 Subject: [PATCH 3/4] Make number of download threads configurable and reduce them to 3 by default. --- docs/CHANGELOG.rst | 1 + resolwe/flow/executors/init_container.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index 783216d11..b19f04f35 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -14,6 +14,7 @@ Changed ------- - Catch ``EndpointConnectionError`` in retry decorator - Implement exponential backoff in retry decorator +- Reduce the maximal number of threads for downloading data in the init container =================== diff --git a/resolwe/flow/executors/init_container.py b/resolwe/flow/executors/init_container.py index ab9294ac9..be9fc8a1e 100644 --- a/resolwe/flow/executors/init_container.py +++ b/resolwe/flow/executors/init_container.py @@ -56,6 +56,9 @@ DOWNLOAD_WAITING_TIMEOUT = 60 # in seconds RETRIES = 5 +# Max threads to use for data download. +MAX_DOWNLOAD_THREADS = int(os.environ.get("GENESIS_MAX_DOWNLOAD_THREADS", 3)) + # Configure container logger. 
All logs are output to stdout for further # processing. # The log level defaults to debug except for boto and google loggers, which @@ -112,8 +115,8 @@ async def transfer_inputs(communicator: BaseCommunicator, missing_data: dict): try: for connector_name in objects_to_transfer: - min_threads = 5 - max_threads = 20 + min_threads = 1 + max_threads = MAX_DOWNLOAD_THREADS # Use from min_threads to max_threads threads. Assume each thread # can handle at least 100 files in reasonable time. files_count = len(objects_to_transfer[connector_name]) From 4379fbec131bcc7c386aa69a90d744ae89e3f71a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gregor=20Jer=C5=A1e?= Date: Mon, 9 Dec 2024 14:15:09 +0100 Subject: [PATCH 4/4] Prepare release 42.0.4 --- docs/CHANGELOG.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index b19f04f35..880b12f7b 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -6,9 +6,9 @@ All notable changes to this project are documented in this file. This project adheres to `Semantic Versioning `_. -========== -Unreleased -========== +=================== +42.0.4 - 2024-12-09 +=================== Changed -------