From 090d52f878999e9f11302e97d594cb6cc63cac98 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 08:55:47 -0500 Subject: [PATCH 1/6] add options to build Arrow with Python and Parquet support --- cpp/CMakeLists.txt | 4 ++- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 31 ++++++++++++++++++------ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d0eabd1e5cd..a4def162717 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,6 +44,8 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) +option(CUDF_USE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) +option(CUDF_USE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF) @@ -272,7 +274,7 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index e15f3f7e16d..17a89d6ed0a 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,11 +14,10 @@ # limitations under the License. 
#============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WITH_PARQUET) set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) - set(ARROW_BUILD_S3 OFF) set(CPMAddOrFindPackage CPMFindPackage) if(NOT ARROW_ARMV8_ARCH) @@ -36,8 +35,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) set(CPMAddOrFindPackage CPMAddPackage) endif() - if(ENABLE_S3) - set(ARROW_BUILD_S3 ON) + set(ARROW_PYTHON_OPTIONS "") + if(WITH_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + # Arrow's logic to find Thrift is busted, so we have to build it from + # source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask? + # Because that's _also_ busted. The only thing that seems to is to set + # _all_ dependencies to bundled, then optionall un-set BOOST_SOURCE to + # SYSTEM. + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") endif() cmake_language(CALL ${CPMAddOrFindPackage} @@ -55,7 +63,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ARROW_BUILD_S3}" + "ARROW_S3 ${ENABLE_S3}" + # e.g. needed by blazingsql-io + "ARROW_PARQUET ${WITH_PARQUET}" + ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -104,7 +115,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3) # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` # and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively, # so for static source-builds, we have to do it after-the-fact. 
- # + # # This only works because we know exactly which components we're using. # Don't forget to update this list if we add more! ### @@ -129,4 +140,10 @@ endfunction() set(CUDF_VERSION_Arrow 4.0.1) -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3}) +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} + ${CUDF_USE_ARROW_STATIC} + ${CUDF_ENABLE_ARROW_S3} + ${CUDF_USE_ARROW_PYTHON} + ${CUDF_USE_ARROW_PARQUET} +) From 6d5d351a58e387f561aa9a1236197e441f385b27 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 09:00:31 -0500 Subject: [PATCH 2/6] copy parquet_version.h --- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 17a89d6ed0a..83ce50f25e5 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -109,6 +109,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WIT DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util") file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu") + if(WITH_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet") + endif() ### # This shouldn't be necessary! 
# From c59237bc87b5dd708fe79be5e01e9f0e61826c88 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 09:07:20 -0500 Subject: [PATCH 3/6] add CUDA_TOOLKIT_ROOT_DIR for Arrow v4.0.1 CMake changes --- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 83ce50f25e5..d0c1c194972 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -56,6 +56,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WIT GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "CUDA_TOOLKIT_ROOT_DIR ${CUDAToolkit_LIBRARY_ROOT}" "CUDA_USE_STATIC_CUDA_RUNTIME ${CUDA_STATIC_RUNTIME}" "ARROW_IPC ON" "ARROW_CUDA ON" From d0d8db58321efe8cc2711eaf5f08e6c77ace73f8 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 09:26:23 -0500 Subject: [PATCH 4/6] rename CUDF_USE_ARROW_* -> CUDF_ENABLE_ARROW_* --- cpp/CMakeLists.txt | 4 ++-- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a4def162717..b3b0de86ae0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,8 +44,8 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) -option(CUDF_USE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) -option(CUDF_USE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) +option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) +option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem 
support" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index d0c1c194972..3cc3ac33720 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,7 +14,7 @@ # limitations under the License. #============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WITH_PARQUET) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET) set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) @@ -36,7 +36,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WIT endif() set(ARROW_PYTHON_OPTIONS "") - if(WITH_PYTHON) + if(ENABLE_PYTHON) list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") # Arrow's logic to build Boost from source is busted, so we have to get it from the system. list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") @@ -66,7 +66,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WIT "ARROW_JEMALLOC OFF" "ARROW_S3 ${ENABLE_S3}" # e.g. 
needed by blazingsql-io - "ARROW_PARQUET ${WITH_PARQUET}" + "ARROW_PARQUET ${ENABLE_PARQUET}" ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" @@ -110,7 +110,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 WITH_PYTHON WIT DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util") file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu") - if(WITH_PARQUET) + if(ENABLE_PARQUET) file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet") endif() @@ -149,6 +149,6 @@ find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} - ${CUDF_USE_ARROW_PYTHON} - ${CUDF_USE_ARROW_PARQUET} + ${CUDF_ENABLE_ARROW_PYTHON} + ${CUDF_ENABLE_ARROW_PARQUET} ) From ca3c12bfc8257f02f04dd5fe58caa0e86af480ce Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 15:31:21 -0500 Subject: [PATCH 5/6] set CUDA_LIB_PATH for Arrow's use of legacy FindCUDA when not building with nvidia-container-runtime --- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 3cc3ac33720..d55aaa03cd2 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -48,6 +48,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") endif() + # Set this so Arrow correctly finds the CUDA toolkit. + # * This must be an ENV var. + # * It must be a path to /lib64/stubs. 
+ # Anything else and Arrow can't find CUDA + set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") + cmake_language(CALL ${CPMAddOrFindPackage} NAME Arrow VERSION ${VERSION} @@ -56,7 +62,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" - "CUDA_TOOLKIT_ROOT_DIR ${CUDAToolkit_LIBRARY_ROOT}" "CUDA_USE_STATIC_CUDA_RUNTIME ${CUDA_STATIC_RUNTIME}" "ARROW_IPC ON" "ARROW_CUDA ON" From b06fbfd7d4e632903e67db4e916f12265b533ab7 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 7 Jul 2021 16:09:48 -0500 Subject: [PATCH 6/6] add comment clarifying CUDA_LIB_PATH --- cpp/cmake/thirdparty/CUDF_GetArrow.cmake | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index d55aaa03cd2..8cef3e8b9d0 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -48,10 +48,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") endif() - # Set this so Arrow correctly finds the CUDA toolkit. - # * This must be an ENV var. - # * It must be a path to /lib64/stubs. - # Anything else and Arrow can't find CUDA + # Set this so Arrow correctly finds the CUDA toolkit when the build machine + # does not have the CUDA driver installed. This must be an env var. set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") cmake_language(CALL ${CPMAddOrFindPackage}