Initialize S3 on first use

Initializing S3 on import may cause undesired consequences for users who
do not use S3:
- Longer import times;
- Consumption of unnecessary resources, e.g., AWS event loop thread(s);
- Potential exposure to bugs in S3 package dependencies.

Therefore, it seems more appropriate to defer S3 initialization to its
first use (see the sketch after the change summary below).
pentschev committed Oct 20, 2023
1 parent 394966b commit 5c5f30a
Showing 2 changed files with 31 additions and 5 deletions.
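
The commit message describes a classic lazy-initialization pattern: keep a
module-level flag, do the real setup on the first call, and register teardown
with `atexit` at that same point. A minimal pure-Python sketch of the idea —
the `_do_*` stubs are illustrative stand-ins for the C++ calls in the diff:

    import atexit

    _initialized = False  # module-level guard, as in the diff below


    def _do_initialize():
        # Stand-in for the real one-time setup (CEnsureS3Initialized() below).
        print("S3 subsystem initialized")


    def _do_finalize():
        # Stand-in for finalize_s3(); runs once at interpreter exit.
        print("S3 subsystem finalized")


    def ensure_initialized():
        """Initialize on first use instead of at import time."""
        global _initialized
        if not _initialized:
            _do_initialize()
            atexit.register(_do_finalize)
            _initialized = True


    ensure_initialized()  # first call: initializes and registers teardown
    ensure_initialized()  # later calls: no-ops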
python/pyarrow/_s3fs.pyx (31 additions, 1 deletion)

@@ -37,6 +37,9 @@ cpdef enum S3LogLevel:
     Debug = <int8_t> CS3LogLevel_Debug
     Trace = <int8_t> CS3LogLevel_Trace
 
+# Prevent registration of multiple `atexit` handlers
+_initialized = False
+
 
 def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal, int num_event_loop_threads=1):
     """
@@ -63,7 +66,14 @@ def ensure_s3_initialized():
"""
Initialize S3 (with default options) if not already initialized
"""
check_status(CEnsureS3Initialized())
global _initialized
cdef int status

if not _initialized:
check_status(CEnsureS3Initialized())
import atexit
atexit.register(finalize_s3)
_initialized = True


def finalize_s3():
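
A detail worth noting in the hunk above: the `_initialized` guard is what the
"Prevent registration of multiple `atexit` handlers" comment refers to, since
`atexit.register` will happily register the same callable repeatedly and run
it once per registration. A quick demonstration:

    import atexit


    def teardown():
        print("teardown")


    # Without a guard, each call to an "ensure" helper would stack another
    # handler; at interpreter exit this prints "teardown" twice:
    atexit.register(teardown)
    atexit.register(teardown)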
@@ -260,6 +270,26 @@ cdef class S3FileSystem(FileSystem):
                  load_frequency=900, proxy_options=None,
                  allow_bucket_creation=False, allow_bucket_deletion=False,
                  retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)):
+        ensure_s3_initialized()
+
+        self._initialize_s3(access_key=access_key, secret_key=secret_key, session_token=session_token,
+                            anonymous=anonymous, region=region, request_timeout=request_timeout,
+                            connect_timeout=connect_timeout, scheme=scheme, endpoint_override=endpoint_override,
+                            background_writes=background_writes, default_metadata=default_metadata,
+                            role_arn=role_arn, session_name=session_name, external_id=external_id,
+                            load_frequency=load_frequency, proxy_options=proxy_options,
+                            allow_bucket_creation=allow_bucket_creation, allow_bucket_deletion=allow_bucket_deletion,
+                            retry_strategy=retry_strategy)
+
+    def _initialize_s3(self, *, access_key=None, secret_key=None, session_token=None,
+                       bint anonymous=False, region=None, request_timeout=None,
+                       connect_timeout=None, scheme=None, endpoint_override=None,
+                       bint background_writes=True, default_metadata=None,
+                       role_arn=None, session_name=None, external_id=None,
+                       load_frequency=900, proxy_options=None,
+                       allow_bucket_creation=False, allow_bucket_deletion=False,
+                       retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)):
+
         cdef:
             optional[CS3Options] options
             shared_ptr[CS3FileSystem] wrapped
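
The hunk above also splits the constructor: the public `__init__` now only
performs the one-time lazy setup before forwarding every keyword argument to
a private `_initialize_s3` that holds the original body. A schematic of that
split, with hypothetical names standing in for the Cython pieces:

    _sdk_ready = False


    def _ensure_sdk_ready():
        # Stand-in for ensure_s3_initialized() in the diff.
        global _sdk_ready
        if not _sdk_ready:
            _sdk_ready = True


    class LazyFileSystem:
        def __init__(self, **kwargs):
            _ensure_sdk_ready()          # runs before any options are built
            self._initialize(**kwargs)   # original constructor body lives here

        def _initialize(self, **kwargs):
            # Stand-in for building CS3Options and wrapping CS3FileSystem.
            self.options = kwargs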
python/pyarrow/fs.py (0 additions, 4 deletions)

@@ -57,10 +57,6 @@
         finalize_s3, initialize_s3, resolve_s3_region)
 except ImportError:
     _not_imported.append("S3FileSystem")
-else:
-    ensure_s3_initialized()
-    import atexit
-    atexit.register(finalize_s3)
 
 
 def __getattr__(name):
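
The net effect of the fs.py hunk: importing `pyarrow.fs` no longer
initializes the AWS SDK; that now happens on first use of S3. A sketch of the
intended behavior after this commit (not a test shipped with the change):

    import pyarrow.fs  # no S3/AWS initialization happens at import anymore

    # First construction triggers ensure_s3_initialized(), which also
    # registers finalize_s3 via atexit:
    s3 = pyarrow.fs.S3FileSystem(anonymous=True, region="us-east-1")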
