From 669009edfc4f7f1fe85a1de641e16f831eee45d6 Mon Sep 17 00:00:00 2001 From: Ziheng Wang Date: Thu, 4 Aug 2022 00:27:23 -0700 Subject: [PATCH] ARROW-16521 [C++][Python] Configure curl timeout policy for S3 (#13385) added timeout options to c++ and python s3fs Lead-authored-by: Ziheng Wang Co-authored-by: Ziheng Wang Co-authored-by: Ziheng Wang Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/filesystem/s3fs.cc | 17 +++++++++++++---- cpp/src/arrow/filesystem/s3fs.h | 11 +++++++++++ python/pyarrow/_s3fs.pyx | 22 ++++++++++++++++++---- python/pyarrow/includes/libarrow_fs.pxd | 2 ++ python/pyarrow/tests/test_fs.py | 9 +++++++++ 5 files changed, 53 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index dd3973ba7717e..aafb9a7cbf13e 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -386,10 +386,11 @@ bool S3Options::Equals(const S3Options& other) const { default_metadata_size ? (other.default_metadata && other.default_metadata->Equals(*default_metadata)) : (!other.default_metadata || other.default_metadata->size() == 0); - return (region == other.region && endpoint_override == other.endpoint_override && - scheme == other.scheme && role_arn == other.role_arn && - session_name == other.session_name && external_id == other.external_id && - load_frequency == other.load_frequency && + return (region == other.region && connect_timeout == other.connect_timeout && + request_timeout == other.request_timeout && + endpoint_override == other.endpoint_override && scheme == other.scheme && + role_arn == other.role_arn && session_name == other.session_name && + external_id == other.external_id && load_frequency == other.load_frequency && proxy_options.Equals(other.proxy_options) && credentials_kind == other.credentials_kind && background_writes == other.background_writes && @@ -718,6 +719,14 @@ class ClientBuilder { if (!options_.region.empty()) { client_config_.region = ToAwsString(options_.region); } + if (options_.request_timeout > 0) { + // Use ceil() to avoid setting it to 0 as that probably means no timeout. + client_config_.requestTimeoutMs = ceil(options_.request_timeout * 1000); + } + if (options_.connect_timeout > 0) { + client_config_.connectTimeoutMs = ceil(options_.connect_timeout * 1000); + } + client_config_.endpointOverride = ToAwsString(options_.endpoint_override); if (options_.scheme == "http") { client_config_.scheme = Aws::Http::Scheme::HTTP; diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 05fa404162aba..3f578aedb27c8 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -103,6 +103,17 @@ struct ARROW_EXPORT S3Options { /// server). std::string region; + /// \brief Socket connection timeout, in seconds + /// + /// If negative, the AWS SDK default value is used (typically 1 second). + double connect_timeout = -1; + + /// \brief Socket read timeout on Windows and macOS, in seconds + /// + /// If negative, the AWS SDK default value is used (typically 3 seconds). + /// This option is ignored on non-Windows, non-macOS systems. + double request_timeout = -1; + /// If non-empty, override region with a connect string such as "localhost:9000" // XXX perhaps instead take a URL like "http://localhost:9000"? std::string endpoint_override; diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index f668038e62381..47cb87c23d214 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -137,6 +137,13 @@ cdef class S3FileSystem(FileSystem): assumed role session will be refreshed. region : str, default 'us-east-1' AWS region to connect to. + request_timeout : double, default None + Socket read timeouts on Windows and macOS, in seconds. + If omitted, the AWS SDK default value is used (typically 3 seconds). + This option is ignored on non-Windows, non-macOS systems. + connect_timeout : double, default None + Socket connection timeout, in seconds. + If omitted, the AWS SDK default value is used (typically 1 second). scheme : str, default 'https' S3 connection transport scheme. endpoint_override : str, default None @@ -183,10 +190,11 @@ cdef class S3FileSystem(FileSystem): CS3FileSystem* s3fs def __init__(self, *, access_key=None, secret_key=None, session_token=None, - bint anonymous=False, region=None, scheme=None, - endpoint_override=None, bint background_writes=True, - default_metadata=None, role_arn=None, session_name=None, - external_id=None, load_frequency=900, proxy_options=None, + bint anonymous=False, region=None, request_timeout=None, + connect_timeout=None, scheme=None, endpoint_override=None, + bint background_writes=True, default_metadata=None, + role_arn=None, session_name=None, external_id=None, + load_frequency=900, proxy_options=None, allow_bucket_creation=False, allow_bucket_deletion=False): cdef: CS3Options options @@ -254,6 +262,10 @@ cdef class S3FileSystem(FileSystem): if region is not None: options.region = tobytes(region) + if request_timeout is not None: + options.request_timeout = request_timeout + if connect_timeout is not None: + options.connect_timeout = connect_timeout if scheme is not None: options.scheme = tobytes(scheme) if endpoint_override is not None: @@ -324,6 +336,8 @@ cdef class S3FileSystem(FileSystem): CS3CredentialsKind_Anonymous), region=frombytes(opts.region), scheme=frombytes(opts.scheme), + connect_timeout=opts.connect_timeout, + request_timeout=opts.request_timeout, endpoint_override=frombytes(opts.endpoint_override), role_arn=frombytes(opts.role_arn), session_name=frombytes(opts.session_name), diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index d47b462aa5c94..69d5dc0ebe573 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -152,6 +152,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: cdef cppclass CS3Options "arrow::fs::S3Options": c_string region + double connect_timeout + double request_timeout c_string endpoint_override c_string scheme c_bool background_writes diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 05ebf4ed4c72f..238bcb73b6e99 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1131,6 +1131,15 @@ def test_s3_options(): assert isinstance(fs, S3FileSystem) assert pickle.loads(pickle.dumps(fs)) == fs + fs = S3FileSystem(request_timeout=0.5, connect_timeout=0.25) + assert isinstance(fs, S3FileSystem) + assert pickle.loads(pickle.dumps(fs)) == fs + + fs2 = S3FileSystem(request_timeout=0.25, connect_timeout=0.5) + assert isinstance(fs2, S3FileSystem) + assert pickle.loads(pickle.dumps(fs2)) == fs2 + assert fs2 != fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError):