Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-2034: [C++] Filesystem implementation for Azure Blob Storage #12914

Closed
wants to merge 38 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
b60c7f4
ARROW-2034: [C++] Filesystem implementation for AzureBlobFileSystem
Apr 18, 2022
1e2d0a3
ARROW-2034: [C++] Fixed formatting issues
Apr 21, 2022
d3cffa2
ARROW-2034: [C++] Fixed formatting issues
Apr 21, 2022
af13444
Added -DARROW_AZURE in ci
May 1, 2022
1026e15
Added CXX_STANDARD and CXX_STANDARD_REQUIRED
May 9, 2022
5f8b82a
Added mocked test file
May 26, 2022
b53a834
Merge remote-tracking branch 'upstream/master' into ARROW-2034-azurefs
May 26, 2022
5bd8210
Turned -DARROW_AZURE=OFF in appveyor-cpp-build
May 26, 2022
eead673
Changed default C++ version
Jun 1, 2022
f99fad5
Changed LibXml2 target
Jun 1, 2022
e2008d8
Fixing CMake styling issues
Jun 2, 2022
bb49f62
Enabling ARROW_AZURE flag
Jun 3, 2022
323b394
Added OpenSSL dependency
Jun 6, 2022
95cc602
Disabling ARROW_AZURE in windows-mingw
Jun 15, 2022
9350b4c
Fixing lint issues
Jun 15, 2022
9cd1a1a
Fixing azurefs_test
Jun 15, 2022
8ba75ae
Added Azurite
Jun 25, 2022
ca9a6fc
Added azurefs_objlib
Jun 25, 2022
f067ba9
Reverting azure object library changes
Jun 26, 2022
1f26725
Added permissions to install_azurite.sh
Jun 27, 2022
c16f853
chmod +x ci/scripts/install_azurite.sh
kou Jun 28, 2022
14267c2
Don't specify CMAKE_CXX_STANDARD by default
kou Jun 28, 2022
11ce11f
Fix system detection
kou Jun 28, 2022
a428a2b
Fix syntax
kou Jun 28, 2022
a62d104
Fix style
kou Jun 28, 2022
488e223
Fix style
kou Jun 28, 2022
3831a88
Running azurite through boost::process
Jun 28, 2022
8248c48
Fixed naming in azurefs_test.cc
Jul 14, 2022
dcd6e30
Fixed naming in azurefs.cc
Jul 19, 2022
b15a6b1
Fixed OpenOutputStream
Jul 26, 2022
a06c480
Merge remote-tracking branch 'upstream/master' into ARROW-2034-azurefs
Jul 26, 2022
a40a316
Added uri.Parse()
Aug 3, 2022
8600b6b
Updated versions.txt
Aug 28, 2022
18dc625
Merge remote-tracking branch 'upstream/master' into ARROW-2034-azurefs
Aug 28, 2022
b532701
Fixed ARROW_AZURE_STORAGE_BLOBS_URL
Aug 29, 2022
200592b
Added libxml2-dev
Aug 29, 2022
fe5b311
Merge remote-tracking branch 'upstream/master' into ARROW-2034-azurefs
Oct 2, 2022
3ea2d7f
Fixed build errors
Oct 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,9 @@ endif()

# Libraries to link statically with libarrow.so
# Each set() below has no value argument, so it resets the variable before the
# conditional list(APPEND ...) calls that follow extend it.
set(ARROW_LINK_LIBS)
# Azure-specific link list; extended with ${AZURESDK_LINK_LIBRARIES} when
# ARROW_AZURE is enabled.
set(ARROW_AZURE_LINK_LIBS)
set(ARROW_STATIC_LINK_LIBS)
# Azure-specific static link list; extended the same way as
# ARROW_AZURE_LINK_LIBS.
set(ARROW_AZURE_STATIC_LINK_LIBS)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you use ARROW_LINK_LIBS and ARROW_STATIC_LINK_LIBS instead of adding new ARROW_AZURE_* variables, as S3 and GCS do?

set(ARROW_STATIC_INSTALL_INTERFACE_LIBS)

if(ARROW_USE_OPENSSL)
Expand Down Expand Up @@ -792,6 +794,11 @@ if(ARROW_WITH_OPENTELEMETRY)
opentelemetry-cpp::otlp_http_exporter)
endif()

# When Azure support is enabled, append the Azure SDK libraries to both
# Azure-specific link lists (regular and static).
if(ARROW_AZURE)
  foreach(_arrow_azure_libs_var ARROW_AZURE_LINK_LIBS ARROW_AZURE_STATIC_LINK_LIBS)
    list(APPEND ${_arrow_azure_libs_var} ${AZURESDK_LINK_LIBRARIES})
  endforeach()
endif()

if(ARROW_WITH_UTF8PROC)
list(APPEND ARROW_LINK_LIBS utf8proc::utf8proc)
list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc)
Expand Down Expand Up @@ -829,6 +836,7 @@ if(ARROW_STATIC_LINK_LIBS)
endif()

# Seed the shared library's private link list from the static link list.
set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS})
# Same seeding for the Azure-specific lists.
set(ARROW_AZURE_SHARED_PRIVATE_LINK_LIBS ${ARROW_AZURE_STATIC_LINK_LIBS})

# boost::filesystem is needed for S3 and Flight tests as a boost::process dependency.
if(((ARROW_FLIGHT
Expand Down
254 changes: 254 additions & 0 deletions cpp/cmake_modules/BuildUtils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,260 @@ if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
list(APPEND ARROW_BOOST_PROCESS_COMPILE_DEFINITIONS "BOOST_USE_WINDOWS_H=1")
endif()

function(ADD_ARROW_LIB_AZURE LIB_NAME)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use add_arrow_lib instead of defining this?

# Builds library ${LIB_NAME} as a shared library (${LIB_NAME}_shared) and/or a
# static library (${LIB_NAME}_static) and installs whichever is built.
#
# Arguments:
#   LIB_NAME                  base name of the library and of its targets
#   BUILD_SHARED <bool>       override ARROW_BUILD_SHARED for this library
#   BUILD_STATIC <bool>       override ARROW_BUILD_STATIC for this library
#   SOURCES <files...>        sources to compile (default: "${LIB_NAME}.cc")
#   SHARED_LINK_LIBS          linked publicly into the shared library
#   SHARED_PRIVATE_LINK_LIBS  linked privately into the shared library
#   STATIC_LINK_LIBS          linked publicly into the static library
#   DEPENDENCIES              targets that must be built before the library
#   OUTPUT_PATH               output directory
#                             (default: ${BUILD_OUTPUT_ROOT_DIRECTORY})
  set(options)
  set(one_value_args BUILD_SHARED BUILD_STATIC)
  set(multi_value_args
      SOURCES
      STATIC_LINK_LIBS
      SHARED_LINK_LIBS
      DEPENDENCIES
      SHARED_PRIVATE_LINK_LIBS
      OUTPUT_PATH)
  cmake_parse_arguments(ARG
                        "${options}"
                        "${one_value_args}"
                        "${multi_value_args}"
                        ${ARGN})
  if(ARG_UNPARSED_ARGUMENTS)
    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
  endif()

  if(ARG_SOURCES)
    set(SOURCES ${ARG_SOURCES})
  else()
    set(SOURCES "${LIB_NAME}.cc")
  endif()

  # Allow overriding ARROW_BUILD_SHARED and ARROW_BUILD_STATIC
  if(DEFINED ARG_BUILD_SHARED)
    set(BUILD_SHARED ${ARG_BUILD_SHARED})
  else()
    set(BUILD_SHARED ${ARROW_BUILD_SHARED})
  endif()
  if(DEFINED ARG_BUILD_STATIC)
    set(BUILD_STATIC ${ARG_BUILD_STATIC})
  else()
    set(BUILD_STATIC ${ARROW_BUILD_STATIC})
  endif()
  if(ARG_OUTPUT_PATH)
    set(OUTPUT_PATH ${ARG_OUTPUT_PATH})
  else()
    set(OUTPUT_PATH ${BUILD_OUTPUT_ROOT_DIRECTORY})
  endif()

  if(WIN32 OR (CMAKE_GENERATOR STREQUAL Xcode))
    # We need to compile C++ separately for each library kind (shared and
    # static) because of dllexport declarations on Windows.
    # Object libraries also don't work reliably with the Xcode generator, as
    # target names are not guessed correctly.
    set(USE_OBJLIB OFF)
  else()
    set(USE_OBJLIB ON)
  endif()

  if(USE_OBJLIB)
    # Generate a single "objlib" from all C++ modules and link
    # that "objlib" into each library kind, to avoid compiling twice
    add_library(${LIB_NAME}_objlib OBJECT ${SOURCES})
    # Necessary to make static linking into other shared libraries work properly
    set_property(TARGET ${LIB_NAME}_objlib PROPERTY POSITION_INDEPENDENT_CODE 1)
    if(ARG_DEPENDENCIES)
      add_dependencies(${LIB_NAME}_objlib ${ARG_DEPENDENCIES})
    endif()
    set(LIB_DEPS $<TARGET_OBJECTS:${LIB_NAME}_objlib>)
  else()
    # Use the resolved source list (not ARG_SOURCES) so that the
    # "${LIB_NAME}.cc" default also applies when compiling per library kind.
    set(LIB_DEPS ${SOURCES})
  endif()

  set(RUNTIME_INSTALL_DIR bin)

  if(BUILD_SHARED)
    add_library(${LIB_NAME}_shared SHARED ${LIB_DEPS})
    if(ARG_DEPENDENCIES)
      add_dependencies(${LIB_NAME}_shared ${ARG_DEPENDENCIES})
    endif()

    set_target_properties(${LIB_NAME}_shared
                          PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OUTPUT_PATH}"
                                     RUNTIME_OUTPUT_DIRECTORY "${OUTPUT_PATH}"
                                     PDB_OUTPUT_DIRECTORY "${OUTPUT_PATH}"
                                     OUTPUT_NAME ${LIB_NAME}
                                     VERSION "${ARROW_FULL_SO_VERSION}"
                                     SOVERSION "${ARROW_SO_VERSION}")

    if(ARG_SHARED_LINK_LIBS)
      target_link_libraries(${LIB_NAME}_shared LINK_PUBLIC ${ARG_SHARED_LINK_LIBS})
    endif()
    target_link_libraries(${LIB_NAME}_shared LINK_PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS})

    install(TARGETS ${LIB_NAME}_shared
            EXPORT ${LIB_NAME}_targets
            RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR}
            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
            INCLUDES
            DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
  endif()

  if(BUILD_STATIC)
    # This must be a STATIC library: building it as SHARED would produce a
    # second shared object which, outside MSVC, has the same OUTPUT_NAME and
    # output directory as ${LIB_NAME}_shared.
    add_library(${LIB_NAME}_static STATIC ${LIB_DEPS})
    if(ARG_DEPENDENCIES)
      add_dependencies(${LIB_NAME}_static ${ARG_DEPENDENCIES})
    endif()

    # With MSVC the static library and the shared library's import library
    # would both be named ${LIB_NAME}.lib, so give the static one a suffix.
    if(MSVC_TOOLCHAIN)
      set(LIB_NAME_STATIC ${LIB_NAME}_static)
    else()
      set(LIB_NAME_STATIC ${LIB_NAME})
    endif()

    if(ARROW_BUILD_STATIC AND WIN32)
      target_compile_definitions(${LIB_NAME}_static PUBLIC ARROW_STATIC)
    endif()

    # Static libraries are "archive" artifacts, so ARCHIVE_OUTPUT_DIRECTORY is
    # the property that actually places them in OUTPUT_PATH.
    set_target_properties(${LIB_NAME}_static
                          PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${OUTPUT_PATH}"
                                     ARCHIVE_OUTPUT_DIRECTORY "${OUTPUT_PATH}"
                                     OUTPUT_NAME ${LIB_NAME_STATIC})

    if(ARG_STATIC_LINK_LIBS)
      target_link_libraries(${LIB_NAME}_static LINK_PUBLIC ${ARG_STATIC_LINK_LIBS})
    endif()

    install(TARGETS ${LIB_NAME}_static
            EXPORT ${LIB_NAME}_targets
            RUNTIME DESTINATION ${RUNTIME_INSTALL_DIR}
            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
            INCLUDES
            DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
  endif()
endfunction()

function(ADD_TEST_CASE_AZURE REL_TEST_NAME)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use add_test_case instead of defining this?

# Builds one test executable from REL_TEST_NAME and registers it with CTest.
#
# Arguments:
#   REL_TEST_NAME           test source path without the ".cc" extension
#   NO_VALGRIND             never run this test under valgrind
#   ENABLED                 build the test even when NO_TESTS is set
#   PRECOMPILED_HEADER_LIB  target whose precompiled header is reused
#   SOURCES                 sources (default: "${REL_TEST_NAME}.cc")
#   PRECOMPILED_HEADERS     headers to precompile for this test
#   STATIC_LINK_LIBS        link libraries replacing ${ARROW_TEST_LINK_LIBS}
#   EXTRA_LINK_LIBS         additional link libraries
#   EXTRA_INCLUDES          additional SYSTEM include directories
#   EXTRA_DEPENDENCIES      targets to build before the test
#   LABELS                  CTest labels; each is also a target depending on
#                           the test and gets a "test-<label>" runner target
#   EXTRA_LABELS            CTest labels without associated targets
#   TEST_ARGUMENTS          arguments passed to the test executable
#   PREFIX                  prepended to the test name as "<prefix>-"
  set(flag_keywords NO_VALGRIND ENABLED)
  set(single_value_keywords PRECOMPILED_HEADER_LIB)
  set(list_keywords
      SOURCES
      PRECOMPILED_HEADERS
      STATIC_LINK_LIBS
      EXTRA_LINK_LIBS
      EXTRA_INCLUDES
      EXTRA_DEPENDENCIES
      LABELS
      EXTRA_LABELS
      TEST_ARGUMENTS
      PREFIX)
  cmake_parse_arguments(ARG
                        "${flag_keywords}"
                        "${single_value_keywords}"
                        "${list_keywords}"
                        ${ARGN})
  if(ARG_UNPARSED_ARGUMENTS)
    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
  endif()

  # Tests are globally disabled unless this one is explicitly ENABLED.
  if(NO_TESTS AND NOT ARG_ENABLED)
    return()
  endif()

  # Source files: explicit list, or "<REL_TEST_NAME>.cc" by default.
  set(test_sources ${ARG_SOURCES})
  if(NOT ARG_SOURCES)
    set(test_sources "${REL_TEST_NAME}.cc")
  endif()

  # Target name: file name without directory/extension, optionally prefixed.
  get_filename_component(test_target ${REL_TEST_NAME} NAME_WE)
  if(ARG_PREFIX)
    set(test_target "${ARG_PREFIX}-${test_target}")
  endif()
  # Make sure the executable name contains only hyphens, not underscores
  string(REPLACE "_" "-" test_target ${test_target})

  set(test_binary "${EXECUTABLE_OUTPUT_PATH}/${test_target}")
  add_executable(${test_target} ${test_sources})

  # On macOS with conda, the conda-installed libraries carry an RPATH that only
  # suits binaries living in $ENV{CONDA_PREFIX}/bin or $ENV{CONDA_PREFIX}/lib.
  # Our test executables are not installed there, so give them an RPATH that
  # finds both the build output and the conda libraries.
  if(APPLE AND NOT "$ENV{CONDA_PREFIX}" STREQUAL "")
    set_target_properties(${test_target}
                          PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE
                                     INSTALL_RPATH_USE_LINK_PATH TRUE
                                     INSTALL_RPATH
                                     "${EXECUTABLE_OUTPUT_PATH};$ENV{CONDA_PREFIX}/lib")
  endif()

  if(ARG_EXTRA_INCLUDES)
    target_include_directories(${test_target} SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES})
  endif()

  if(ARG_EXTRA_DEPENDENCIES)
    add_dependencies(${test_target} ${ARG_EXTRA_DEPENDENCIES})
  endif()

  # Base link libraries (custom list or the default test libraries), followed
  # by any extra ones.
  if(ARG_STATIC_LINK_LIBS)
    set(base_link_libs ${ARG_STATIC_LINK_LIBS})
  else()
    set(base_link_libs ${ARROW_TEST_LINK_LIBS})
  endif()
  target_link_libraries(${test_target} PRIVATE ${base_link_libs})
  if(ARG_EXTRA_LINK_LIBS)
    target_link_libraries(${test_target} PRIVATE ${ARG_EXTRA_LINK_LIBS})
  endif()

  if(ARG_PRECOMPILED_HEADER_LIB)
    reuse_precompiled_header_lib(${test_target} ${ARG_PRECOMPILED_HEADER_LIB})
  endif()

  if(ARG_PRECOMPILED_HEADERS AND ARROW_USE_PRECOMPILED_HEADERS)
    target_precompile_headers(${test_target} PRIVATE ${ARG_PRECOMPILED_HEADERS})
  endif()

  # Register with CTest: under valgrind when memcheck is requested, directly on
  # Windows, and through the run-test.sh wrapper everywhere else.
  if(ARROW_TEST_MEMCHECK AND NOT ARG_NO_VALGRIND)
    add_test(${test_target}
             bash
             -c
             "cd '${CMAKE_SOURCE_DIR}'; \
valgrind --suppressions=valgrind.supp --tool=memcheck --gen-suppressions=all \
--num-callers=500 --leak-check=full --leak-check-heuristics=stdstring \
--error-exitcode=1 ${test_binary} ${ARG_TEST_ARGUMENTS}")
  elseif(WIN32)
    add_test(${test_target} ${test_binary} ${ARG_TEST_ARGUMENTS})
  else()
    add_test(${test_target}
             ${BUILD_SUPPORT_DIR}/run-test.sh
             ${CMAKE_BINARY_DIR}
             test
             ${test_binary}
             ${ARG_TEST_ARGUMENTS})
  endif()

  # Add test as dependency of relevant targets
  add_dependencies(all-tests ${test_target})
  foreach(label ${ARG_LABELS})
    add_dependencies(${label} ${test_target})

    # ensure there is a cmake target which exercises tests with this label
    set(label_runner "test-${label}")
    if(NOT TARGET ${label_runner})
      add_custom_target(${label_runner}
                        ctest -L "${label}" --output-on-failure
                        USES_TERMINAL)
    endif()
    # ensure the test is (re)built before the label's tests run
    add_dependencies(${label_runner} ${test_target})
  endforeach()

  # CTest labels: always "unittest", then LABELS, then EXTRA_LABELS.
  # EXTRA_LABELS don't create their own dependencies, they are only used
  # to ease running certain test categories.
  set(ctest_labels "unittest")
  foreach(label_list_var ARG_LABELS ARG_EXTRA_LABELS)
    if(${label_list_var})
      list(APPEND ctest_labels ${${label_list_var}})
    endif()
  endforeach()

  set_property(TEST ${test_target}
               APPEND
               PROPERTY LABELS ${ctest_labels})
endfunction()

function(ADD_THIRDPARTY_LIB LIB_NAME)
set(options)
set(one_value_args SHARED_LIB STATIC_LIB)
Expand Down
2 changes: 2 additions & 0 deletions cpp/cmake_modules/DefineOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
#----------------------------------------------------------------------
set_option_category("Project component")

define_option(ARROW_AZURE "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF)

define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF)

define_option(ARROW_COMPUTE "Build the Arrow Compute Modules" OFF)
Expand Down
47 changes: 47 additions & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4513,6 +4513,53 @@ if(ARROW_S3)
endif()
endif()

macro(build_azuresdk)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you build the Azure C++ SDK with externalproject_add in this macro?

# Defines the variables that describe the Azure C++ SDK dependency. This is a
# macro, so every variable set below stays visible in the caller's scope.
# NOTE(review): the status message announces a source build, but no download or
# build step is visible in this macro -- confirm where the SDK actually comes
# from.
message(STATUS "Building Azure C++ SDK from source")

# Install prefix and header directory, both under the current binary dir.
set(AZURESDK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/azuresdk_ep-install")
set(AZURESDK_INCLUDE_DIR "${AZURESDK_PREFIX}/include")

# NOTE(review): AZURESDK_CMAKE_ARGS is assembled here but not consumed inside
# this macro; presumably meant for an external-project build -- TODO confirm.
set(AZURESDK_CMAKE_ARGS
${EP_COMMON_CMAKE_ARGS}
-DBUILD_TESTING=OFF
-DCMAKE_INSTALL_LIBDIR=lib
"-DCMAKE_INSTALL_PREFIX=${AZURESDK_PREFIX}"
-DCMAKE_PREFIX_PATH=${AZURESDK_PREFIX})

# Create the include directory at configure time.
file(MAKE_DIRECTORY ${AZURESDK_INCLUDE_DIR})

# Azure C++ SDK related libraries to link statically
set(_AZURESDK_LIBS
azure-core
azure-identity
azure-storage-blobs
azure-storage-common
azure-storage-files-datalake)
# For every component <name> above, collect the package name "<name>-cpp" in
# AZURESDK_LIBRARIES_CPP and the target name "Azure::<name>" in
# AZURESDK_LIBRARIES.
set(AZURESDK_LIBRARIES)
set(AZURESDK_LIBRARIES_CPP)
foreach(_AZURESDK_LIB ${_AZURESDK_LIBS})
# NOTE(review): the upper-cased, underscore-separated prefix computed by the
# next two lines is not used anywhere else in this macro.
string(TOUPPER ${_AZURESDK_LIB} _AZURESDK_LIB_UPPER)
string(REPLACE "-" "_" _AZURESDK_LIB_NAME_PREFIX ${_AZURESDK_LIB_UPPER})
list(APPEND AZURESDK_LIBRARIES_CPP "${_AZURESDK_LIB}-cpp")
set(_AZURESDK_TARGET_NAME Azure::${_AZURESDK_LIB})
list(APPEND AZURESDK_LIBRARIES ${_AZURESDK_TARGET_NAME})
endforeach()

# The libraries to link against are exactly the Azure::<name> targets.
set(AZURESDK_LINK_LIBRARIES ${AZURESDK_LIBRARIES})
endmacro()

# Azure SDK setup; only runs when Arrow is configured with ARROW_AZURE.
if(ARROW_AZURE)
# Populates the AZURESDK_* variables used below.
build_azuresdk()

# Require the CMake package config of every SDK component named in
# AZURESDK_LIBRARIES_CPP; configuration fails if one cannot be found.
foreach(AZURESDK_LIBRARY_CPP ${AZURESDK_LIBRARIES_CPP})
find_package(${AZURESDK_LIBRARY_CPP} CONFIG REQUIRED)
endforeach()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needless.


# Add the SDK header directory as a SYSTEM include path. This is
# directory-scoped rather than attached to a specific target.
include_directories(SYSTEM ${AZURESDK_INCLUDE_DIR})
# Report the header directory and link libraries that were selected.
message(STATUS "Found AZURE SDK headers: ${AZURESDK_INCLUDE_DIR}")
message(STATUS "Found AZURE SDK libraries: ${AZURESDK_LINK_LIBRARIES}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
message(STATUS "Found AZURE SDK headers: ${AZURESDK_INCLUDE_DIR}")
message(STATUS "Found AZURE SDK libraries: ${AZURESDK_LINK_LIBRARIES}")
message(STATUS "Found Azure SDK headers: ${AZURESDK_INCLUDE_DIR}")
message(STATUS "Found Azure SDK libraries: ${AZURESDK_LINK_LIBRARIES}")

endif()

message(STATUS "All bundled static libraries: ${ARROW_BUNDLED_STATIC_LIBS}")

# Write out the package configurations.
Expand Down
Loading