diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index f6324c1c0a96d..0f7357ea5b31f 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -46,6 +46,32 @@ "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, + { + "name": "emscripten-overrides", + "hidden": true, + "cacheVariables": { + "ARROW_BUILD_SHARED": "OFF", + "ARROW_BUILD_STATIC": "ON", + "ARROW_BUILD_TESTS": "OFF", + "ARROW_ENABLE_THREADING": "OFF", + "ARROW_CUDA": "OFF", + "ARROW_MIMALLOC": "OFF", + "ARROW_JEMALLOC": "OFF", + "ARROW_S3": "OFF", + "ARROW_DEPENDENCY_SOURCE": "BUNDLED", + "ZLIB_SOURCE": "SYSTEM", + "ARROW_IPC": "OFF", + "ARROW_ORC": "OFF", + "ARROW_SUBSTRAIT": "OFF", + "ARROW_DEPENDENCY_USE_SHARED": "OFF", + "ARROW_WITH_BROTLI": "OFF", + "ARROW_SIMD_LEVEL":"NONE", + "ARROW_RUNTIME_SIMD_LEVEL":"NONE", + "CMAKE_C_BYTE_ORDER":"LITTLE_ENDIAN", + "ARROW_WITH_OPENTELEMETRY":"OFF", + "CMAKE_TOOLCHAIN_FILE": { "type": "PATH", "value": "${sourceDir}/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake" } + } + }, { "name": "features-minimal", "hidden": true, @@ -395,6 +421,16 @@ "displayName": "Release build for PyArrow with everything enabled", "cacheVariables": {} }, + { + "name": "ninja-release-emscripten-python", + "inherits": [ + "emscripten-overrides", + "base-release", + "features-python" + ], + "displayName": "Release build which builds an emscripten library, plus PyArrow for Pyodide", + "cacheVariables": {} + }, { "name": "ninja-release-maximal", "inherits": [ diff --git a/cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake b/cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake new file mode 100644 index 0000000000000..d8e49521dba94 --- /dev/null +++ b/cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Force some variables for emscripten +# to disable things that won't work there + +# make us be on the platforms list for cmake +get_filename_component(PLATFORM_FOLDER_PARENT ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) +list(APPEND CMAKE_MODULE_PATH "${PLATFORM_FOLDER_PARENT}") + +include($ENV{EMSDK}/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake) + +# ensure zlib is built with -fpic +# and force us to link to the version in emscripten ports +if(NOT EXISTS ${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a) + execute_process(COMMAND embuilder --pic --force build zlib) +endif() +set(ZLIB_LIBRARY ${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a) + +# # override default in emscripten which is to not use shared libs +set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS TRUE) + +# if we leave the system name as Emscripten, then it reloads the original Emscripten.cmake every time a project() command +# is run, which does bad things like disabling shared libraries +set(CMAKE_SYSTEM_NAME EmscriptenOverrides) + +set(CMAKE_C_FLAGS "-sUSE_ZLIB=1 -sSIDE_MODULE=1 -fPIC -fexceptions") +set(CMAKE_CXX_FLAGS "-sUSE_ZLIB=1 -sSIDE_MODULE=1 -fPIC -fexceptions") + +#set(PYARROW_CPP_HOME "$ENV{ARROW_HOME}/lib") +#list(APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_INSTALL_PREFIX}/cmake") + +set(Python3_INCLUDE_DIR $ENV{PYTHONINCLUDE}) +set(Python3_LIBRARY $ENV{CPYTHONLIB}) +set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/core/include) +set(Python3_EXECUTABLE) +set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions") +set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions") +set(CMAKE_SHARED_LINKER_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions") +set(CMAKE_STRIP FALSE) + +set(ENV{_PYTHON_SYSCONFIGDATA_NAME} $ENV{SYSCONFIG_NAME}) diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index a5f5659723c28..2564ff374ac71 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -24,7 +24,9 @@ include(CheckCXXSourceCompiles) message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}") if(NOT DEFINED ARROW_CPU_FLAG) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") + if(CMAKE_SYSTEM_NAME MATCHES "Emscripten") + set(ARROW_CPU_FLAG "emscripten") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64") set(ARROW_CPU_FLAG "x86") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64") set(ARROW_CPU_FLAG "aarch64") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 85c0337d108be..667784ad2e0d2 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -955,6 +955,11 @@ set(EP_COMMON_CMAKE_ARGS -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT} -DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE}) +# if building with a toolchain file, pass that through +if(CMAKE_TOOLCHAIN_FILE) + list(APPEND EP_COMMON_CMAKE_ARGS -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}) +endif() + # Enable s/ccache if set by parent. 
if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER) list(APPEND EP_COMMON_CMAKE_ARGS @@ -1614,6 +1619,9 @@ macro(build_thrift) if(DEFINED BOOST_ROOT) list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}") endif() + if(DEFINED Boost_INCLUDE_DIR) + list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}") + endif() if(DEFINED Boost_NAMESPACE) list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}") endif() diff --git a/docs/source/developers/cpp/emscripten.rst b/docs/source/developers/cpp/emscripten.rst new file mode 100644 index 0000000000000..d62e71c7d497b --- /dev/null +++ b/docs/source/developers/cpp/emscripten.rst @@ -0,0 +1,100 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + + +.. highlight:: console .. _developers-cpp-emscripten: + +################################################# + Cross-compiling for WebAssembly with Emscripten +################################################# + +*************** + Prerequisites +*************** +You need CMake, compilers, and the other prerequisites from the normal build instructions installed. Before building with Emscripten, you also need to install the Emscripten SDK and +activate it using the commands below (see https://emscripten.org/docs/getting_started/downloads.html for details). + +.. code:: shell + + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + # replace <version> with the desired EMSDK version. + # e.g. for pyodide 0.24, you need EMSDK version 3.1.45 + ./emsdk install <version> + ./emsdk activate <version> + source ./emsdk_env.sh + +If you want to build pyarrow for `pyodide <https://pyodide.org/>`_, you +need ``pyodide-build`` installed via ``pip``, and to be running the +same version of Python that Pyodide is built for, along with the matching +version of emsdk. + +.. code:: shell + + # install pyodide build tools. + # e.g. for version 0.24 of pyodide: + pip install pyodide-build==0.24 + +Then build with the ``ninja-release-emscripten-python`` CMake preset, +as below: + +.. code:: shell + + cmake --preset "ninja-release-emscripten-python" + ninja install + +This builds a static library version of libarrow and installs it into the +Emscripten sysroot cache, so that anything you build against that sysroot +can find libarrow. + +For example, if you want to build for Pyodide, run the commands above, and then +go to ``arrow/python`` and run + +.. code:: shell + + pyodide build + +It should produce a wheel targeting the currently enabled version of +Pyodide (i.e. the version corresponding to the currently installed +``pyodide-build``) in the ``dist`` subdirectory. + +************** + Manual Build +************** + +If you want to build for Emscripten manually, take a look at the +``CMakePresets.json`` file in the ``arrow/cpp`` directory for a list of settings +you will need to override.
In particular, you will need: + +#. Build dependencies set to ``BUNDLED``, so that properly + cross-compiled build dependencies are used. + +#. ``CMAKE_TOOLCHAIN_FILE`` set to + ``arrow/cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake``. + +#. You will quite likely need to set ``ARROW_ENABLE_THREADING`` to ``OFF`` + for builds targeting single-threaded Emscripten environments such as + Pyodide. + +#. ``ARROW_IPC`` and anything else that uses the network probably won't + work. + +#. ``ARROW_JEMALLOC`` and ``ARROW_MIMALLOC`` likewise probably need to be + ``OFF``. + +#. ``ARROW_BUILD_STATIC`` set to ``ON`` and ``ARROW_BUILD_SHARED`` set to + ``OFF`` is the combination most likely to work. diff --git a/docs/source/developers/cpp/index.rst b/docs/source/developers/cpp/index.rst index 36c9778bea1b0..603e1607dc543 100644 --- a/docs/source/developers/cpp/index.rst +++ b/docs/source/developers/cpp/index.rst @@ -27,5 +27,6 @@ C++ Development building development windows + emscripten conventions fuzzing diff --git a/python/.gitignore b/python/.gitignore index ce7f065412728..eba9f9102dee0 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -44,3 +44,6 @@ pyarrow/_table_api.h manylinux1/arrow nm_arrow.log visible_symbols.log + +# build override configuration +arrow_build_overrides.cfg \ No newline at end of file diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 242ba8448f4a6..6b8fdade7f2f3 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -68,6 +68,8 @@ if(POLICY CMP0095) cmake_policy(SET CMP0095 NEW) endif() +option(DUMP_ARROW_ARGUMENTS "Dump the arrow arguments then quit" OFF) + # Use the first Python installation on PATH, not the newest one set(Python3_FIND_STRATEGY "LOCATION") # On Windows, use registry last, not first @@ -260,6 +262,22 @@ include(GNUInstallDirs) find_package(Arrow REQUIRED) +if(DUMP_ARROW_ARGUMENTS) + message(STATUS "----- ARROW_SETTINGS_DUMP -----") + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + unset(MATCHED) + string(REGEX MATCH ^ARROW_.* MATCHED ${_variableName}) + if(NOT MATCHED) + continue() + endif() + message(STATUS "${_variableName}=${${_variableName}}") + endforeach() + message(STATUS "----- ARROW_SETTINGS_END -----") + return() +endif() + set(PYARROW_CPP_ROOT_DIR pyarrow/src) set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) set(PYARROW_CPP_SRCS @@ -570,28 +588,42 @@ endif() # Acero if(PYARROW_BUILD_ACERO) - if(PYARROW_BUNDLE_ARROW_CPP) - bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION}) - if(MSVC) - bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB}) + if(ARROW_BUILD_SHARED) + + if(PYARROW_BUNDLE_ARROW_CPP) + bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION}) + if(MSVC) + bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB}) + endif() endif() - endif() - set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared) - list(APPEND CYTHON_EXTENSIONS _acero) + set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared) + list(APPEND CYTHON_EXTENSIONS _acero) + else() + # ACERO is statically linked into libarrow_python already + set(ACERO_LINK_LIBS) + list(APPEND CYTHON_EXTENSIONS _acero) + endif() endif() # Dataset if(PYARROW_BUILD_DATASET) - if(PYARROW_BUNDLE_ARROW_CPP) - bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION}) - if(MSVC) - bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB}) + if(ARROW_BUILD_SHARED) + + if(PYARROW_BUNDLE_ARROW_CPP) + bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION
${ARROW_SO_VERSION}) + if(MSVC) + bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB}) + endif() endif() - endif() - set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared) + set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared) + else() + # dataset is statically linked into libarrow_python already + set(DATASET_LINK_LIBS) + endif() list(APPEND CYTHON_EXTENSIONS _dataset) + endif() # Parquet @@ -612,7 +644,9 @@ if(PYARROW_BUILD_PARQUET) endif() set(PARQUET_LINK_LIBS Parquet::parquet_shared) else() - set(PARQUET_LINK_LIBS Parquet::parquet_static) + # parquet is linked into libarrow_python already + # so isn't needed in the extension + set(PARQUET_LINK_LIBS "") endif() list(APPEND CYTHON_EXTENSIONS _parquet) if(PYARROW_BUILD_PARQUET_ENCRYPTION) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 79bd270ce54d2..430ff064478d9 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -27,7 +27,7 @@ import warnings import pyarrow as pa from pyarrow.lib cimport * -from pyarrow.lib import frombytes, tobytes +from pyarrow.lib import frombytes, tobytes, is_threading_enabled from pyarrow.includes.libarrow cimport * from pyarrow.includes.libarrow_dataset cimport * from pyarrow.includes.libarrow_dataset_parquet cimport * @@ -687,6 +687,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): new CParquetFragmentScanOptions())) self.use_buffered_stream = use_buffered_stream self.buffer_size = buffer_size + if pre_buffer and not is_threading_enabled(): + pre_buffer = False self.pre_buffer = pre_buffer if thrift_string_size_limit is not None: self.thrift_string_size_limit = thrift_string_size_limit @@ -730,6 +732,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): @pre_buffer.setter def pre_buffer(self, bint pre_buffer): + if pre_buffer and not is_threading_enabled(): + return self.arrow_reader_properties().set_pre_buffer(pre_buffer) @property diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 50b4ed8e86e65..d691d1834081a 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -38,7 +38,7 @@ from pyarrow.lib cimport (_Weakrefable, Buffer, Schema, from pyarrow.lib import (ArrowException, NativeFile, BufferOutputStream, _stringify_path, - tobytes, frombytes) + tobytes, frombytes, is_threading_enabled) cimport cpython as cp @@ -1206,6 +1206,9 @@ cdef class ParquetReader(_Weakrefable): default_arrow_reader_properties()) FileReaderBuilder builder + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + if metadata is not None: c_metadata = metadata.sp_metadata diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 4d109fc660e08..887cf70c05ea5 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -301,3 +301,6 @@ cdef extern from "arrow/python/benchmark.h" namespace "arrow::py::benchmark": cdef extern from "arrow/python/gdb.h" namespace "arrow::gdb" nogil: void GdbTestSession "arrow::gdb::TestSession"() + +cdef extern from "arrow/python/helpers.h" namespace "arrow::py::internal": + c_bool IsThreadingEnabled() diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 460e932b86273..b1ad757344497 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -32,7 +32,6 @@ from queue import Queue, Empty as QueueEmpty from pyarrow.util import _is_path_like, _stringify_path - # 64K DEFAULT_BUFFER_SIZE = 2 ** 16 @@ -677,7 +676,10 @@ cdef
class NativeFile(_Weakrefable): self.seek(0) - writer_thread = threading.Thread(target=bg_write) + if is_threading_enabled(): + writer_thread = threading.Thread(target=bg_write) + else: + writer_thread = None # This isn't ideal -- PyBytes_FromStringAndSize copies the data from # the passed buffer, so it's hard for us to avoid doubling the memory @@ -685,8 +687,8 @@ cdef class NativeFile(_Weakrefable): if buf == NULL: raise MemoryError("Failed to allocate {0} bytes" .format(buffer_size)) - - writer_thread.start() + if writer_thread: + writer_thread.start() cdef int64_t total_bytes = 0 cdef int32_t c_buffer_size = buffer_size @@ -706,18 +708,23 @@ cdef class NativeFile(_Weakrefable): pybuf = cp.PyBytes_FromStringAndSize(buf, bytes_read) - if writer_thread.is_alive(): - while write_queue.full(): - time.sleep(0.01) - else: - break + if writer_thread is not None: + if writer_thread.is_alive(): + while write_queue.full(): + time.sleep(0.01) + else: + break - write_queue.put_nowait(pybuf) + write_queue.put_nowait(pybuf) + + else: + # no background thread + stream.write(pybuf) finally: free(buf) done = True - - writer_thread.join() + if writer_thread is not None: + writer_thread.join() if exc_info is not None: raise exc_info[0], exc_info[1], exc_info[2] @@ -753,8 +760,11 @@ cdef class NativeFile(_Weakrefable): except Exception as e: exc_info = sys.exc_info() - writer_thread = threading.Thread(target=bg_write) - writer_thread.start() + if is_threading_enabled(): + writer_thread = threading.Thread(target=bg_write) + writer_thread.start() + else: + writer_thread = None try: while True: @@ -762,17 +772,21 @@ cdef class NativeFile(_Weakrefable): if not buf: break - if writer_thread.is_alive(): - while write_queue.full(): - time.sleep(0.01) - else: - break + if writer_thread is not None: + if writer_thread.is_alive(): + while write_queue.full(): + time.sleep(0.01) + else: + break - write_queue.put_nowait(buf) + write_queue.put_nowait(buf) + else: + # no threading + self.write(buf) finally: done = True - - writer_thread.join() + if writer_thread is not None: + writer_thread.join() if exc_info is not None: raise exc_info[0], exc_info[1], exc_info[2] diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 57fb0f42e38bf..64d1035972747 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -79,6 +79,10 @@ def set_cpu_count(int count): check_status(SetCpuThreadPoolCapacity(count)) +def is_threading_enabled(): + return libarrow_python.IsThreadingEnabled() + + Type_NA = _Type_NA Type_BOOL = _Type_BOOL Type_UINT8 = _Type_UINT8 diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 4e5c868efd4c8..72ed0c4a965a1 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,7 +33,7 @@ import numpy as np import pyarrow as pa -from pyarrow.lib import _pandas_api, frombytes # noqa +from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa _logical_type_map = {} @@ -608,7 +608,7 @@ def _can_definitely_zero_copy(arr): arr.flags.contiguous and issubclass(arr.dtype.type, np.integer)) - if nthreads == 1: + if nthreads == 1 or not is_threading_enabled(): arrays = [convert_column(c, f) for c, f in zip(columns_to_convert, convert_fields)] else: diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index e0cdfee62ef4b..26d7ae900f219 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -52,6 +52,7 @@ from pyarrow.fs import (LocalFileSystem, FileSystem, FileType, 
_resolve_filesystem_and_path, _ensure_filesystem) from pyarrow import filesystem as legacyfs +from pyarrow.lib import is_threading_enabled from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api _URI_STRIP_SCHEMES = ('hdfs',) @@ -329,6 +330,9 @@ def __init__(self, source, *, metadata=None, common_metadata=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None, filesystem=None): + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + self._close_source = getattr(source, 'closed', True) filesystem, source = _resolve_filesystem_and_path( @@ -1494,8 +1498,9 @@ def __init__(self, dirpath, open_file_func=None, filesystem=None, self.partitions = ParquetPartitions() self.pieces = [] self._metadata_nthreads = metadata_nthreads - self._thread_pool = futures.ThreadPoolExecutor( - max_workers=metadata_nthreads) + if is_threading_enabled(): + self._thread_pool = futures.ThreadPoolExecutor( + max_workers=metadata_nthreads) self.common_metadata_path = None self.metadata_path = None @@ -1510,7 +1515,8 @@ def __init__(self, dirpath, open_file_func=None, filesystem=None, # _common_metadata is a subset of _metadata self.common_metadata_path = self.metadata_path - self._thread_pool.shutdown() + if is_threading_enabled(): + self._thread_pool.shutdown() def _visit_level(self, level, base_path, part_keys): fs = self.filesystem @@ -1562,7 +1568,7 @@ def _visit_directories(self, level, directories, part_keys): dir_part_keys = part_keys + [(name, index)] # If you have less threads than levels, the wait call will block # indefinitely due to multiple waits within a thread. - if level < self._metadata_nthreads: + if level < self._metadata_nthreads and is_threading_enabled(): future = self._thread_pool.submit(self._visit_level, level + 1, path, @@ -1775,6 +1781,9 @@ def __new__(cls, path_or_paths=None, filesystem=None, schema=None, thrift_container_size_limit=None): extra_msg = "" + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + if use_legacy_dataset is None: # if an old filesystem is passed -> still use to old implementation if isinstance(filesystem, legacyfs.FileSystem): @@ -1822,6 +1831,10 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, thrift_container_size_limit=None): + + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + if partitioning != "hive": raise ValueError( 'Only "hive" for hive-like partitioning is supported when ' @@ -2414,6 +2427,8 @@ def __init__(self, path_or_paths, filesystem=None, *, filters=None, thrift_container_size_limit=None, **kwargs): import pyarrow.dataset as ds + if pre_buffer and not is_threading_enabled(): + pre_buffer = False # Raise error for not supported keywords for keyword, default in [ @@ -2943,6 +2958,10 @@ def read_table(source, *, columns=None, use_threads=True, metadata=None, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None): + + if pre_buffer and not is_threading_enabled(): + pre_buffer = False + if not use_legacy_dataset: if metadata is not None: raise ValueError( diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index c266abc169d49..d444accad8bfa 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -29,6 +29,7 @@ #include "arrow/python/decimal.h" #include "arrow/type_fwd.h" #include 
"arrow/util/checked_cast.h" +#include "arrow/util/config.h" #include "arrow/util/logging.h" namespace arrow { @@ -465,6 +466,14 @@ void DebugPrint(PyObject* obj) { PySys_WriteStderr("%s\n", repr.c_str()); } +bool IsThreadingEnabled() { +#ifdef ARROW_ENABLE_THREADING + return true; +#else + return false; +#endif +} + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/pyarrow/src/arrow/python/helpers.h b/python/pyarrow/src/arrow/python/helpers.h index a8e5f80b60678..e2fd8212ae68d 100644 --- a/python/pyarrow/src/arrow/python/helpers.h +++ b/python/pyarrow/src/arrow/python/helpers.h @@ -154,6 +154,9 @@ Status IntegerScalarToFloat32Safe(PyObject* obj, float* result); // \brief Print Python object __repr__ void DebugPrint(PyObject* obj); +ARROW_PYTHON_EXPORT +bool IsThreadingEnabled(); + } // namespace internal } // namespace py } // namespace arrow diff --git a/python/setup.py b/python/setup.py index abd9d03cfb17e..ba3ea7a6fd791 100755 --- a/python/setup.py +++ b/python/setup.py @@ -23,7 +23,9 @@ from os.path import join as pjoin import re import shlex +import subprocess import sys +import tempfile if sys.version_info >= (3, 10): import sysconfig @@ -133,8 +135,69 @@ def run(self): 'bundle the Arrow C++ headers')] + _build_ext.user_options) + def get_arrow_build_options(self): + """ + read arrow options from cmake + """ + if hasattr(self, "_arrow_build_options"): + return self._arrow_build_options + self._arrow_build_options = {} + # first find the cmake file + source = os.path.dirname(os.path.abspath(__file__)) + # now make a temp folder to run cmake in + with tempfile.TemporaryDirectory() as td: + old_dir = os.getcwd() + os.chdir(td) + cmake_cmdline = ["cmake", source, "-DDUMP_ARROW_ARGUMENTS=ON"] + if sysconfig.get_config_var("SOABI").find("emscripten") != -1: + cmake_cmdline.append( + "-DCMAKE_TOOLCHAIN_FILE=" + + source + + "/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake" + ) + result = subprocess.run(cmake_cmdline, capture_output=True, text=True) + os.chdir(old_dir) + in_dump = False + for line in result.stdout.splitlines(): + if line.find("----- ARROW_SETTINGS_DUMP -----") != -1: + in_dump = True + if line.find("----- ARROW_SETTINGS_END -----") != -1: + break + if in_dump: + m = re.match(r"-- ([^=]*)=(.*)", line) + if m: + key = m.group(1) + value = m.group(2) + self._arrow_build_options[key] = value + return self._arrow_build_options + + def get_env_option(self, name, default): + """ + Get an option from environment variable. If the variable is not set, + a default is used based on arrow cmake options. 
+ """ + if name in os.environ: + return strtobool(os.environ.get(name)) + else: + special_cases = { + "PYARROW_WITH_PARQUET_ENCRYPTION": "PARQUET_REQUIRE_ENCRYPTION"} + cmake_default_name = None + if name in special_cases: + cmake_default_name = special_cases[name] + elif name.startswith("PYARROW_WITH_"): + cmake_default_name = name.replace("PYARROW_WITH_", "ARROW_") + elif name.startswith("PYARROW_"): + cmake_default_name = name.replace("PYARROW_", "ARROW_") + if cmake_default_name is not None: + # get name from arrow cmake options + cmake_options = self.get_arrow_build_options() + return strtobool(cmake_options.get(cmake_default_name, default)) + else: + return strtobool(default) + def initialize_options(self): _build_ext.initialize_options(self) + self.cmake_generator = os.environ.get('PYARROW_CMAKE_GENERATOR') if not self.cmake_generator and sys.platform == 'win32': self.cmake_generator = 'Visual Studio 15 2017 Win64' @@ -150,36 +213,22 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' - self.with_gcs = strtobool( - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( - os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_substrait = strtobool( - os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) - self.with_flight = strtobool( - os.environ.get('PYARROW_WITH_FLIGHT', '0')) - self.with_acero = strtobool( - os.environ.get('PYARROW_WITH_ACERO', '0')) - self.with_dataset = strtobool( - os.environ.get('PYARROW_WITH_DATASET', '0')) - self.with_parquet = strtobool( - os.environ.get('PYARROW_WITH_PARQUET', '0')) - self.with_parquet_encryption = strtobool( - os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) - self.with_orc = strtobool( - os.environ.get('PYARROW_WITH_ORC', '0')) - self.with_gandiva = strtobool( - os.environ.get('PYARROW_WITH_GANDIVA', '0')) - self.generate_coverage = strtobool( - os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) - self.bundle_arrow_cpp = strtobool( - os.environ.get('PYARROW_BUNDLE_ARROW_CPP', '0')) - self.bundle_cython_cpp = strtobool( - os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) + self.with_gcs = self.get_env_option('PYARROW_WITH_GCS', '0') + self.with_s3 = self.get_env_option('PYARROW_WITH_S3', '0') + self.with_hdfs = self.get_env_option('PYARROW_WITH_HDFS', '0') + self.with_cuda = self.get_env_option('PYARROW_WITH_CUDA', '0') + self.with_substrait = self.get_env_option('PYARROW_WITH_SUBSTRAIT', '0') + self.with_flight = self.get_env_option('PYARROW_WITH_FLIGHT', '0') + self.with_acero = self.get_env_option('PYARROW_WITH_ACERO', '0') + self.with_dataset = self.get_env_option('PYARROW_WITH_DATASET', '0') + self.with_parquet = self.get_env_option('PYARROW_WITH_PARQUET', '0') + self.with_parquet_encryption = self.get_env_option( + 'PYARROW_WITH_PARQUET_ENCRYPTION', '0') + self.with_orc = self.get_env_option('PYARROW_WITH_ORC', '0') + self.with_gandiva = self.get_env_option('PYARROW_WITH_GANDIVA', '0') + self.generate_coverage = self.get_env_option('PYARROW_GENERATE_COVERAGE', '0') + self.bundle_arrow_cpp = self.get_env_option('PYARROW_BUNDLE_ARROW_CPP', '0') + self.bundle_cython_cpp = self.get_env_option('PYARROW_BUNDLE_CYTHON_CPP', '0') self.with_parquet_encryption = (self.with_parquet_encryption and self.with_parquet) @@ -306,6 +355,13 @@ def append_cmake_bool(value, varname): if parallel: build_tool_args.append(f'-j{parallel}') + if 
sysconfig.get_config_var("SOABI").find("emscripten") != -1: + cmake_options.append( + "-DCMAKE_TOOLCHAIN_FILE=" + + source + + "/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake" + ) + # Generate the build files print("-- Running cmake for PyArrow") self.spawn(['cmake'] + extra_cmake_args + cmake_options + [source])
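
A note on the threading guard used throughout the Python-side changes: ``pyarrow.lib.is_threading_enabled()`` simply reports whether the Arrow C++ library was compiled with ``ARROW_ENABLE_THREADING``, so Python code can fall back to single-threaded paths on Emscripten/Pyodide builds. The sketch below is illustrative only (the helper names ``read_table_portably`` and ``run_in_background`` are hypothetical, not part of this patch) and assumes a pyarrow wheel built from this branch.

.. code:: python

   import threading

   import pyarrow.parquet as pq
   from pyarrow.lib import is_threading_enabled  # added in lib.pyx by this patch


   def read_table_portably(path):
       # On builds compiled without ARROW_ENABLE_THREADING (e.g. Pyodide),
       # avoid Parquet options that rely on background threads.
       threads_ok = is_threading_enabled()
       return pq.read_table(path, use_threads=threads_ok, pre_buffer=threads_ok)


   def run_in_background(task):
       # Mirrors the fallback pattern in io.pxi above: use a worker thread
       # when threading is available, otherwise run the task inline.
       if is_threading_enabled():
           worker = threading.Thread(target=task)
           worker.start()
           worker.join()
       else:
           task()

With this patch applied, the guarded paths in ``parquet/core.py`` and ``io.pxi`` perform equivalent checks internally, so most callers should not need to consult the flag themselves.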