Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-23221: [C++] webassembly / pyodide / emscripten build support #37696

Closed
wants to merge 12 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,32 @@
"CMAKE_BUILD_TYPE": "RelWithDebInfo"
}
},
{
"name": "emscripten-overrides",
"hidden": true,
"cacheVariables": {
"ARROW_BUILD_SHARED": "OFF",
"ARROW_BUILD_STATIC": "ON",
"ARROW_BUILD_TESTS": "OFF",
"ARROW_ENABLE_THREADING": "OFF",
"ARROW_CUDA": "OFF",
"ARROW_MIMALLOC": "OFF",
"ARROW_JEMALLOC": "OFF",
"ARROW_S3": "OFF",
"ARROW_DEPENDENCY_SOURCE": "BUNDLED",
"ZLIB_SOURCE": "SYSTEM",
"ARROW_IPC": "OFF",
"ARROW_ORC": "OFF",
"ARROW_SUBSTRAIT": "OFF",
"ARROW_DEPENDENCY_USE_SHARED": "OFF",
"ARROW_WITH_BROTLI": "OFF",
"ARROW_SIMD_LEVEL":"NONE",
"ARROW_RUNTIME_SIMD_LEVEL":"NONE",
"CMAKE_C_BYTE_ORDER":"LITTLE_ENDIAN",
"ARROW_WITH_OPENTELEMETRY":"OFF",
"CMAKE_TOOLCHAIN_FILE": { "type": "PATH", "value": "${sourceDir}/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake" }
}
},
{
"name": "features-minimal",
"hidden": true,
Expand Down Expand Up @@ -395,6 +421,16 @@
"displayName": "Release build for PyArrow with everything enabled",
"cacheVariables": {}
},
{
"name": "ninja-release-emscripten-python",
"inherits": [
"emscripten-overrides",
"base-release",
"features-python"
],
"displayName": "Release build which builds an emscripten library, plus PyArrow for Pyodide",
"cacheVariables": {}
},
{
"name": "ninja-release-maximal",
"inherits": [
Expand Down
56 changes: 56 additions & 0 deletions cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Force some variables for emscripten
# to disable things that won't work there

# Toolchain file for cross compiling with Emscripten. It loads the stock
# Emscripten platform file shipped with emsdk and then overrides a number of
# settings (PIC zlib, shared-library support, compile/link flags, Python paths).
#
# Environment variables read at configure time:
#   EMSDK          - root of the emsdk installation (required)
#   PYTHONINCLUDE  - used for Python3_INCLUDE_DIR
#   CPYTHONLIB     - used for Python3_LIBRARY
#   NUMPY_LIB      - NumPy directory; "core/include" is appended to it
#   SYSCONFIG_NAME - re-exported as _PYTHON_SYSCONFIGDATA_NAME

# This file lives in a Platform/ folder; put that folder's parent on
# CMAKE_MODULE_PATH so that CMake's Platform/<CMAKE_SYSTEM_NAME>.cmake lookup
# can find it under the system name set further below.
get_filename_component(PLATFORM_FOLDER_PARENT "${CMAKE_CURRENT_LIST_DIR}" DIRECTORY)
list(APPEND CMAKE_MODULE_PATH "${PLATFORM_FOLDER_PARENT}")

# Fail with an actionable message instead of an obscure "could not find
# /upstream/emscripten/..." include error when emsdk has not been activated.
if("$ENV{EMSDK}" STREQUAL "")
  message(FATAL_ERROR
          "The EMSDK environment variable is not set. Install and activate emsdk "
          "(e.g. 'source ./emsdk_env.sh') before configuring with this toolchain file."
  )
endif()

include("$ENV{EMSDK}/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake")

# ensure zlib is built with -fpic
# and force us to link to the version in emscripten ports
# NOTE(review): EMSCRIPTEN_SYSROOT is presumably defined by the Emscripten.cmake
# file included above - confirm.
set(ZLIB_LIBRARY "${EMSCRIPTEN_SYSROOT}/lib/wasm32-emscripten/pic/libz.a")
if(NOT EXISTS "${ZLIB_LIBRARY}")
  execute_process(COMMAND embuilder --pic --force build zlib
                  RESULT_VARIABLE _arrow_embuilder_result)
  # Report a failed build here; otherwise the missing libz.a only shows up
  # much later as a link error.
  if(NOT "${_arrow_embuilder_result}" STREQUAL "0")
    message(WARNING
            "'embuilder --pic --force build zlib' failed (${_arrow_embuilder_result}); "
            "${ZLIB_LIBRARY} may be missing. Is embuilder on PATH?")
  endif()
  unset(_arrow_embuilder_result)
endif()

# override default in emscripten which is to not use shared libs
set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS TRUE)

# if we leave the system name as Emscripten, then it reloads the original
# Emscripten.cmake every time a project() command is run, which does bad things
# like disabling shared libraries
set(CMAKE_SYSTEM_NAME EmscriptenOverrides)

# Note: these assignments replace, rather than extend, any previous value of
# the flag variables.
set(CMAKE_C_FLAGS "-sUSE_ZLIB=1 -sSIDE_MODULE=1 -fPIC -fexceptions")
set(CMAKE_CXX_FLAGS "-sUSE_ZLIB=1 -sSIDE_MODULE=1 -fPIC -fexceptions")

# Python locations come from the environment. They are left unquoted on
# purpose: an empty environment variable then leaves the CMake variable unset
# instead of defining it as an empty string.
set(Python3_INCLUDE_DIR $ENV{PYTHONINCLUDE})
set(Python3_LIBRARY $ENV{CPYTHONLIB})
set(Python3_NumPy_INCLUDE_DIR $ENV{NUMPY_LIB}/core/include)
# set() without a value unsets the variable.
set(Python3_EXECUTABLE)

set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions")
set(CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions")
set(CMAKE_SHARED_LINKER_FLAGS "-sUSE_ZLIB=1 -sWASM_BIGINT=1 -fexceptions")
# NOTE(review): presumably this disables stripping of the built binaries - confirm.
set(CMAKE_STRIP FALSE)

# Export the sysconfig data module name into the environment of this CMake run.
set(ENV{_PYTHON_SYSCONFIGDATA_NAME} $ENV{SYSCONFIG_NAME})
4 changes: 3 additions & 1 deletion cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ include(CheckCXXSourceCompiles)
message(STATUS "System processor: ${CMAKE_SYSTEM_PROCESSOR}")

if(NOT DEFINED ARROW_CPU_FLAG)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64")
if(CMAKE_SYSTEM_NAME MATCHES "Emscripten")
set(ARROW_CPU_FLAG "emscripten")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64|amd64|X86|x86|i[3456]86|x64")
set(ARROW_CPU_FLAG "x86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64|arm64")
set(ARROW_CPU_FLAG "aarch64")
Expand Down
8 changes: 8 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,11 @@ set(EP_COMMON_CMAKE_ARGS
-DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
-DCMAKE_VERBOSE_MAKEFILE=${CMAKE_VERBOSE_MAKEFILE})

# When this build was configured with a toolchain file, add the same
# -DCMAKE_TOOLCHAIN_FILE setting to the common argument list
# (EP_COMMON_CMAKE_ARGS) so it is passed through.
if(CMAKE_TOOLCHAIN_FILE)
  list(APPEND EP_COMMON_CMAKE_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
endif()

# Enable s/ccache if set by parent.
if(CMAKE_C_COMPILER_LAUNCHER AND CMAKE_CXX_COMPILER_LAUNCHER)
list(APPEND EP_COMMON_CMAKE_ARGS
Expand Down Expand Up @@ -1614,6 +1619,9 @@ macro(build_thrift)
if(DEFINED BOOST_ROOT)
list(APPEND THRIFT_CMAKE_ARGS "-DBOOST_ROOT=${BOOST_ROOT}")
endif()
if(DEFINED Boost_INCLUDE_DIR)
list(APPEND THRIFT_CMAKE_ARGS "-DBoost_INCLUDE_DIR=${Boost_INCLUDE_DIR}")
endif()
if(DEFINED Boost_NAMESPACE)
list(APPEND THRIFT_CMAKE_ARGS "-DBoost_NAMESPACE=${Boost_NAMESPACE}")
endif()
Expand Down
100 changes: 100 additions & 0 deletions docs/source/developers/cpp/emscripten.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
.. Licensed to the Apache Software Foundation (ASF) under one
.. or more contributor license agreements. See the NOTICE file
.. distributed with this work for additional information
.. regarding copyright ownership. The ASF licenses this file
.. to you under the Apache License, Version 2.0 (the
.. "License"); you may not use this file except in compliance
.. with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
.. software distributed under the License is distributed on an
.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
.. KIND, either express or implied. See the License for the
.. specific language governing permissions and limitations
.. under the License.


.. highlight:: console

.. _developers-cpp-emscripten:

#################################################
Cross compiling for WebAssembly with Emscripten
#################################################

***************
Prerequisites
***************
You need CMake and a compiler toolchain installed, as described in the normal build instructions. Before building with Emscripten, you also need to install Emscripten and
activate it using the commands below (see https://emscripten.org/docs/getting_started/downloads.html for details).

.. code:: shell

git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
# replace <version> with the desired EMSDK version.
# e.g. for pyodide 0.24, you need EMSDK version 3.1.45
./emsdk install <version>
./emsdk activate <version>
source ./emsdk_env.sh

If you want to build pyarrow for `pyodide <https://pyodide.org>`_, you
need ``pyodide-build`` installed via ``pip``, and to be running with the
same version of Python that pyodide is built for, along with the same
version of emsdk.

.. code:: shell

# install pyodide build tools.
# e.g. for version 0.24 of pyodide:
pip install pyodide-build==0.24

Then build with the ``ninja-release-emscripten-python`` cmake preset,
like below:

.. code:: shell

cmake --preset "ninja-release-emscripten-python"
ninja install

This will install a built static library version of libarrow into the
emscripten sysroot cache, meaning you can build things that depend on it
and they will find libarrow.

e.g. if you want to build for pyodide, run the commands above, and then
go to ``arrow/python`` and run

.. code:: shell

pyodide build

It should make a wheel targeting the currently enabled version of
pyodide (i.e. the version corresponding to the currently installed
``pyodide-build``) in the ``dist`` subdirectory.

**************
Manual Build
**************

If you want to manually build for emscripten, take a look at the
CMakePresets.json file in the arrow/cpp directory for a list of things
you will need to override. In particular you will need:

#. Build dependencies set to ``BUNDLED``, so it uses properly cross
compiled build dependencies.

#. ``CMAKE_TOOLCHAIN_FILE`` set to
``arrow/cpp/cmake_modules/Emscripten/Platform/EmscriptenOverrides.cmake``

#. You will quite likely need to set ``ARROW_ENABLE_THREADING`` to ``OFF``
for builds targeting single threaded emscripten environments such as
pyodide.

#. ``ARROW_IPC`` and anything else that uses network probably won't
work.

#. ``ARROW_JEMALLOC`` and ``ARROW_MIMALLOC`` again probably need to be
``OFF``

#. ``ARROW_BUILD_STATIC`` set to ``ON`` and ``ARROW_BUILD_SHARED`` set to
``OFF`` is most likely to work.
1 change: 1 addition & 0 deletions docs/source/developers/cpp/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ C++ Development
building
development
windows
emscripten
conventions
fuzzing
3 changes: 3 additions & 0 deletions python/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ pyarrow/_table_api.h
manylinux1/arrow
nm_arrow.log
visible_symbols.log

# build override configuration
arrow_build_overrides.cfg
62 changes: 48 additions & 14 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ if(POLICY CMP0095)
cmake_policy(SET CMP0095 NEW)
endif()

# When ON, the configure step prints every variable whose name starts with
# ARROW_ (after find_package(Arrow)) between two marker lines and then stops
# processing this file.
option(DUMP_ARROW_ARGUMENTS "Dump the arrow arguments then quit" OFF)

# Use the first Python installation on PATH, not the newest one
set(Python3_FIND_STRATEGY "LOCATION")
# On Windows, use registry last, not first
Expand Down Expand Up @@ -260,6 +262,22 @@ include(GNUInstallDirs)

find_package(Arrow REQUIRED)

# Diagnostic mode: print every currently defined variable whose name starts
# with ARROW_ as a "NAME=VALUE" status line, framed by begin/end marker lines,
# then stop processing this file.
if(DUMP_ARROW_ARGUMENTS)
  message(STATUS "----- ARROW_SETTINGS_DUMP -----")
  get_cmake_property(_arrow_variable_names VARIABLES)
  # Select the ARROW_-prefixed names directly rather than truth-testing a
  # per-name regex match, which would drop a name that CMake treats as a false
  # constant (for example one ending in -NOTFOUND).
  list(FILTER _arrow_variable_names INCLUDE REGEX "^ARROW_")
  list(SORT _arrow_variable_names)
  foreach(_arrow_variable_name IN LISTS _arrow_variable_names)
    message(STATUS "${_arrow_variable_name}=${${_arrow_variable_name}}")
  endforeach()
  message(STATUS "----- ARROW_SETTINGS_END -----")
  return()
endif()

set(PYARROW_CPP_ROOT_DIR pyarrow/src)
set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python)
set(PYARROW_CPP_SRCS
Expand Down Expand Up @@ -570,28 +588,42 @@ endif()

# Acero
if(PYARROW_BUILD_ACERO)
if(PYARROW_BUNDLE_ARROW_CPP)
bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
if(MSVC)
bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
if(ARROW_BUILD_SHARED)

if(PYARROW_BUNDLE_ARROW_CPP)
bundle_arrow_lib(${ARROW_ACERO_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
if(MSVC)
bundle_arrow_import_lib(${ARROW_ACERO_IMPORT_LIB})
endif()
endif()
endif()

set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
list(APPEND CYTHON_EXTENSIONS _acero)
set(ACERO_LINK_LIBS ArrowAcero::arrow_acero_shared)
list(APPEND CYTHON_EXTENSIONS _acero)
else()
# ACERO is statically linked into libarrow_python already
set(ACERO_LINK_LIBS)
list(APPEND CYTHON_EXTENSIONS _acero)
endif()
endif()

# Dataset
if(PYARROW_BUILD_DATASET)
if(PYARROW_BUNDLE_ARROW_CPP)
bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
if(MSVC)
bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB})
if(ARROW_BUILD_SHARED)

if(PYARROW_BUNDLE_ARROW_CPP)
bundle_arrow_lib(${ARROW_DATASET_SHARED_LIB} SO_VERSION ${ARROW_SO_VERSION})
if(MSVC)
bundle_arrow_import_lib(${ARROW_DATASET_IMPORT_LIB})
endif()
endif()
endif()

set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared)
set(DATASET_LINK_LIBS ArrowDataset::arrow_dataset_shared)
else()
# dataset is statically linked into libarrow_python already
set(DATASET_LINK_LIBS)
endif()
list(APPEND CYTHON_EXTENSIONS _dataset)

endif()

# Parquet
Expand All @@ -612,7 +644,9 @@ if(PYARROW_BUILD_PARQUET)
endif()
set(PARQUET_LINK_LIBS Parquet::parquet_shared)
else()
set(PARQUET_LINK_LIBS Parquet::parquet_static)
# parquet is linked into libarrow_python already
# so isn't needed in the extension
set(PARQUET_LINK_LIBS "")
endif()
list(APPEND CYTHON_EXTENSIONS _parquet)
if(PYARROW_BUILD_PARQUET_ENCRYPTION)
Expand Down
6 changes: 5 additions & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import warnings

import pyarrow as pa
from pyarrow.lib cimport *
from pyarrow.lib import frombytes, tobytes
from pyarrow.lib import frombytes, tobytes, is_threading_enabled
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.includes.libarrow_dataset_parquet cimport *
Expand Down Expand Up @@ -687,6 +687,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
new CParquetFragmentScanOptions()))
self.use_buffered_stream = use_buffered_stream
self.buffer_size = buffer_size
if pre_buffer and not is_threading_enabled():
pre_buffer=False
self.pre_buffer = pre_buffer
if thrift_string_size_limit is not None:
self.thrift_string_size_limit = thrift_string_size_limit
Expand Down Expand Up @@ -730,6 +732,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):

@pre_buffer.setter
def pre_buffer(self, bint pre_buffer):
# A request to enable pre-buffering is ignored when is_threading_enabled()
# reports False; the reader property is left unchanged in that case.
if pre_buffer and not is_threading_enabled():
return
self.arrow_reader_properties().set_pre_buffer(pre_buffer)

@property
Expand Down
Loading
Loading