diff --git a/.asf.yaml b/.asf.yaml
index 12438081cfc57..7a0180a94a308 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -16,7 +16,7 @@
 # under the License.
 
 github:
-  description: "Apache Arrow is a multi-language toolbox for accelerated data interchange and in-memory processing"
+  description: "Apache Arrow is the universal columnar format and multi-language toolbox for fast data interchange and in-memory analytics"
   homepage: https://arrow.apache.org/
   collaborators:
     - anjakefala
diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml
index e448209056d78..eaa2249950fb7 100644
--- a/.github/workflows/archery.yml
+++ b/.github/workflows/archery.yml
@@ -19,6 +19,11 @@ name: Archery & Crossbow
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/archery.yml'
diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml
index 93bc723cd4304..634448d0c8f25 100644
--- a/.github/workflows/cpp.yml
+++ b/.github/workflows/cpp.yml
@@ -19,6 +19,11 @@ name: C++
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/cpp.yml'
@@ -141,7 +146,15 @@ jobs:
           path: .docker
           key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }}
           restore-keys: ${{ matrix.image }}-
-      - name: Setup Python
+      - name: Setup Python on hosted runner
+        if: |
+          matrix.runs-on == 'ubuntu-latest'
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: 3
+      - name: Setup Python on self-hosted runner
+        if: |
+          contains(matrix.runs-on, 'self-hosted')
         run: |
           sudo apt update
           sudo apt install -y --no-install-recommends python3 python3-dev python3-pip
diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml
index 5f657e6c1bf58..a608888c7e35f 100644
--- a/.github/workflows/csharp.yml
+++ b/.github/workflows/csharp.yml
@@ -19,6 +19,11 @@ name: C#
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.github/workflows/csharp.yml'
       - 'ci/scripts/csharp_*'
@@ -39,7 +44,7 @@ permissions:
 
 jobs:
   ubuntu:
-    name: AMD64 Ubuntu 18.04 C# ${{ matrix.dotnet }}
+    name: AMD64 Ubuntu 24.04 C# ${{ matrix.dotnet }}
     runs-on: ubuntu-latest
     if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
     timeout-minutes: 15
@@ -52,6 +57,10 @@ jobs:
         uses: actions/setup-dotnet@v4.0.1
         with:
           dotnet-version: ${{ matrix.dotnet }}
+      - name: Setup Python
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: 3
       - name: Checkout Arrow
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index d2436fe3c4525..a6d403af7470f 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -18,8 +18,13 @@
 name: Dev
 
 on:
-  # always trigger
+  # always trigger, except for Dependabot push events
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
   pull_request:
 
 concurrency:
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index af9a98ed437f8..763394dacdae7 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -19,6 +19,11 @@ name: Integration
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/integration.yml'
diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml
index 5766c63bf5221..a27b3ef2854ca 100644
--- a/.github/workflows/java.yml
+++ b/.github/workflows/java.yml
@@ -19,6 +19,11 @@ name: Java
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/java.yml'
diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml
index 56aa1d0992887..59936e1cd9955 100644
--- a/.github/workflows/java_jni.yml
+++ b/.github/workflows/java_jni.yml
@@ -19,6 +19,11 @@ name: Java JNI
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/java_jni.yml'
diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml
index 031310fd4027b..dfad7de0b644f 100644
--- a/.github/workflows/js.yml
+++ b/.github/workflows/js.yml
@@ -19,6 +19,11 @@ name: NodeJS
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/js.yml'
diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml
index 7d217b07ad7d7..17362206a81a7 100644
--- a/.github/workflows/matlab.yml
+++ b/.github/workflows/matlab.yml
@@ -19,6 +19,11 @@ name: MATLAB
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.github/workflows/matlab.yml'
       - 'ci/scripts/matlab*.sh'
diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml
index 1ee824843ed2b..83d19b7d247f9 100644
--- a/.github/workflows/pr_review_trigger.yml
+++ b/.github/workflows/pr_review_trigger.yml
@@ -29,7 +29,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: "Upload PR review Payload"
-        uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1
+        uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
         with:
          path: "${{ github.event_path }}"
          name: "pr_review_payload"
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 84c8a6553b00f..91f09f6c661ae 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -19,6 +19,11 @@ name: Python
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - '.github/workflows/python.yml'
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 1897f332f7506..3913ab8f022e8 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -19,6 +19,11 @@ name: R
 
 on:
   push:
+    branches:
+      - '**'
+      - '!dependabot/**'
+    tags:
+      - '**'
     paths:
       - '.dockerignore'
       - ".github/workflows/r.yml"
@@ -59,7 +64,9 @@ env:
 jobs:
   ubuntu-minimum-cpp-version:
     name: Check minimum supported Arrow C++ Version (${{ matrix.cpp_version }})
-    runs-on: ubuntu-latest
+    # We don't provide Apache Arrow C++ 15.0.2 deb packages for Ubuntu 24.04.
+    # So we use ubuntu-22.04 here.
+ runs-on: ubuntu-22.04 strategy: matrix: include: @@ -170,7 +177,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -230,7 +237,7 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* @@ -292,7 +299,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@604373da6381bf24206979c74d06a550515601b9 # v4.4.1 + - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index 13da7e62ee0cd..d6cc5c9b97cd4 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -19,6 +19,11 @@ name: C GLib & Ruby on: push: + branches: + - '**' + - '!dependabot/**' + tags: + - '**' paths: - '.dockerignore' - '.github/workflows/ruby.yml' diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 87aa5cb83f714..e241713cf73cd 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -19,6 +19,11 @@ name: Swift on: push: + branches: + - '**' + - '!dependabot/**' + tags: + - '**' paths: - '.dockerignore' - '.github/workflows/swift.yml' @@ -58,6 +63,10 @@ jobs: with: fetch-depth: 0 submodules: recursive + - name: Setup Python on hosted runner + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: 3 - name: Setup Archery run: pip install -e dev/archery[docker] - name: Execute Docker Build diff --git a/README.md b/README.md index 7c31c91a5198a..f49ec4b8d98ee 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,9 @@ ## Powering In-Memory Analytics -Apache Arrow is a development platform for in-memory analytics. It contains a -set of technologies that enable big data systems to process and move data fast. +Apache Arrow is a universal columnar format and multi-language toolbox for fast +data interchange and in-memory analytics. It contains a set of technologies that +enable data systems to efficiently store, process, and move data. 
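The reworded README pitch above is easiest to see in code. Here is a minimal sketch, not part of this diff, of the C++ API that pitch refers to; it assumes only an installed libarrow and standard linking:

```cpp
// Minimal sketch: build an in-memory columnar array with Arrow C++.
// Assumes an installed libarrow; link with -larrow.
#include <iostream>
#include <memory>

#include <arrow/api.h>

int main() {
  arrow::Int32Builder builder;
  // Builder APIs report failures through arrow::Status.
  if (!builder.AppendValues({1, 2, 3}).ok()) return 1;
  std::shared_ptr<arrow::Array> array;
  if (!builder.Finish(&array).ok()) return 1;
  std::cout << array->ToString() << std::endl;  // prints the column values
  return 0;
}
```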
Major components of the project include: diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp index d1c06000065dc..dff972515022f 100644 --- a/c_glib/arrow-glib/basic-data-type.cpp +++ b/c_glib/arrow-glib/basic-data-type.cpp @@ -1660,9 +1660,9 @@ enum { PROP_STORAGE_DATA_TYPE = 1 }; -G_DEFINE_TYPE_WITH_PRIVATE(GArrowExtensionDataType, - garrow_extension_data_type, - GARROW_TYPE_DATA_TYPE) +G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowExtensionDataType, + garrow_extension_data_type, + GARROW_TYPE_DATA_TYPE) #define GARROW_EXTENSION_DATA_TYPE_GET_PRIVATE(obj) \ static_cast( \ diff --git a/c_glib/meson.build b/c_glib/meson.build index 214c57747033e..96ca375716bad 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -103,23 +103,23 @@ else endif if arrow_cpp_build_lib_dir == '' - arrow = dependency('arrow') + arrow = dependency('arrow', version: ['>=' + version]) # They are just for checking required modules are enabled. They are built into # libarrow.so. So we don't need additional build flags for them. - dependency('arrow-compute') - dependency('arrow-csv') - dependency('arrow-filesystem') - dependency('arrow-json') + dependency('arrow-compute', version: ['>=' + version]) + dependency('arrow-csv', version: ['>=' + version]) + dependency('arrow-filesystem', version: ['>=' + version]) + dependency('arrow-json', version: ['>=' + version]) - have_arrow_orc = dependency('arrow-orc', required: false).found() - arrow_cuda = dependency('arrow-cuda', required: false) + have_arrow_orc = dependency('arrow-orc', required: false, version: ['>=' + version]).found() + arrow_cuda = dependency('arrow-cuda', required: false, version: ['>=' + version]) # we do not support compiling glib without acero engine - arrow_acero = dependency('arrow-acero', required: true) - arrow_dataset = dependency('arrow-dataset', required: false) - arrow_flight = dependency('arrow-flight', required: false) - arrow_flight_sql = dependency('arrow-flight-sql', required: false) - gandiva = dependency('gandiva', required: false) - parquet = dependency('parquet', required: false) + arrow_acero = dependency('arrow-acero', required: true, version: ['>=' + version]) + arrow_dataset = dependency('arrow-dataset', required: false, version: ['>=' + version]) + arrow_flight = dependency('arrow-flight', required: false, version: ['>=' + version]) + arrow_flight_sql = dependency('arrow-flight-sql', required: false, version: ['>=' + version]) + gandiva = dependency('gandiva', required: false, version: ['>=' + version]) + parquet = dependency('parquet', required: false, version: ['>=' + version]) else base_include_directories += [ include_directories(join_paths(arrow_cpp_build_dir, 'src')), diff --git a/c_glib/test/test-decimal128-data-type.rb b/c_glib/test/test-decimal128-data-type.rb index f0e62c9d131b4..8cf97e38d47b5 100644 --- a/c_glib/test/test-decimal128-data-type.rb +++ b/c_glib/test/test-decimal128-data-type.rb @@ -47,9 +47,7 @@ def test_decimal_data_type_new end def test_invalid_precision - message = - "[decimal128-data-type][new]: Invalid: Decimal precision out of range [1, 38]: 39" - assert_raise(Arrow::Error::Invalid.new(message)) do + assert_raise(Arrow::Error::Invalid) do Arrow::Decimal128DataType.new(39, 1) end end diff --git a/c_glib/test/test-decimal256-data-type.rb b/c_glib/test/test-decimal256-data-type.rb index 6d803f7ce9020..f5f89c2c46502 100644 --- a/c_glib/test/test-decimal256-data-type.rb +++ b/c_glib/test/test-decimal256-data-type.rb @@ -47,9 +47,7 @@ def test_decimal_data_type_new 
end def test_invalid_precision - message = - "[decimal256-data-type][new]: Invalid: Decimal precision out of range [1, 76]: 77" - assert_raise(Arrow::Error::Invalid.new(message)) do + assert_raise(Arrow::Error::Invalid) do Arrow::Decimal256DataType.new(77, 1) end end diff --git a/c_glib/test/test-extension-data-type.rb b/c_glib/test/test-extension-data-type.rb index 6c114b81e2c33..8dfee88518a0e 100644 --- a/c_glib/test/test-extension-data-type.rb +++ b/c_glib/test/test-extension-data-type.rb @@ -102,4 +102,10 @@ def test_wrap_chunked_array extension_chunked_array.chunks.collect(&:class), ]) end + + def test_abstract_class + assert_raise(TypeError) do + Arrow::ExtensionDataType.new + end + end end diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 08a052e82f24d..084117f38778a 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -139,7 +139,7 @@ set PARQUET_HOME=%CONDA_PREFIX%\Library @rem Download IANA Timezone Database to a non-standard location to @rem test the configurability of the timezone database path -curl https://data.iana.org/time-zones/releases/tzdata2021e.tar.gz --output tzdata.tar.gz || exit /B +curl https://data.iana.org/time-zones/releases/tzdata2024b.tar.gz --output tzdata.tar.gz || exit /B mkdir %USERPROFILE%\Downloads\test\tzdata tar --extract --file tzdata.tar.gz --directory %USERPROFILE%\Downloads\test\tzdata curl https://raw.githubusercontent.com/unicode-org/cldr/master/common/supplemental/windowsZones.xml ^ diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 885019ff3049f..6365fcfacfc38 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -55,7 +55,6 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_GANDIVA:=OFF} : ${ARROW_GCS:=ON} : ${ARROW_HDFS:=ON} -: ${ARROW_JEMALLOC:=ON} : ${ARROW_MIMALLOC:=ON} : ${ARROW_ORC:=ON} : ${ARROW_PARQUET:=ON} @@ -81,6 +80,9 @@ if [[ "$(uname -m)" == arm* ]] || [[ "$(uname -m)" == aarch* ]]; then # 4k and 64k page arm64 systems. For more context see # https://github.com/apache/arrow/issues/10929 export ARROW_EXTRA_CMAKE_FLAGS="-DARROW_JEMALLOC_LG_PAGE=16" + : ${ARROW_JEMALLOC:=OFF} +else + : ${ARROW_JEMALLOC:=ON} fi mkdir /tmp/arrow-build diff --git a/cpp/README.md b/cpp/README.md index a070a16e74b56..2c05b10fe1331 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -31,4 +31,4 @@ to install pre-compiled binary versions of the library. Please refer to our latest [C++ Development Documentation][1]. -[1]: https://github.com/apache/arrow/blob/main/docs/source/developers/cpp +[1]: https://arrow.apache.org/docs/dev/developers/cpp/ diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 692efa78376f4..90839cb44627c 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -97,28 +97,43 @@ function(arrow_create_merged_static_lib output_target) endforeach() if(APPLE) - # The apple-distributed libtool is what we want for bundling, but there is - # a GNU libtool that has a namecollision (and happens to be bundled with R, too). - # We are not compatible with GNU libtool, so we need to avoid it. 
-
-    # check in the obvious places first to find Apple's libtool
-    # HINTS is used before system paths and before PATHS, so we use that
-    # even though hard coded paths should go in PATHS
-    # TODO: use a VALIDATOR when we require cmake >= 3.25
-    find_program(LIBTOOL_MACOS libtool HINTS /usr/bin
-                 /Library/Developer/CommandLineTools/usr/bin)
-
-    # confirm that the libtool we found is not GNU libtool
+    if(CMAKE_LIBTOOL)
+      set(LIBTOOL_MACOS ${CMAKE_LIBTOOL})
+    else()
+      # The apple-distributed libtool is what we want for bundling, but there is
+      # a GNU libtool that has a name collision (and happens to be bundled with R, too).
+      # We are not compatible with GNU libtool, so we need to avoid it.
+
+      # check in the obvious places first to find Apple's libtool
+      # HINTS is used before system paths and before PATHS, so we use that
+      # even though hard coded paths should go in PATHS
+      # TODO: use a VALIDATOR when we require cmake >= 3.25
+      find_program(LIBTOOL_MACOS libtool
+                   HINTS /usr/bin /Library/Developer/CommandLineTools/usr/bin)
+    endif()
+
+    # confirm that the libtool we found is Apple's libtool
     execute_process(COMMAND ${LIBTOOL_MACOS} -V
                     OUTPUT_VARIABLE LIBTOOL_V_OUTPUT
                     OUTPUT_STRIP_TRAILING_WHITESPACE)
     if(NOT "${LIBTOOL_V_OUTPUT}" MATCHES ".*cctools-([0-9.]+).*")
-      message(FATAL_ERROR "libtool found appears to be the incompatible GNU libtool: ${LIBTOOL_MACOS}"
+      message(FATAL_ERROR "libtool found appears not to be Apple's libtool: ${LIBTOOL_MACOS}"
       )
     endif()
 
     set(BUNDLE_COMMAND ${LIBTOOL_MACOS} "-no_warning_for_no_symbols" "-static" "-o"
                        ${output_lib_path} ${all_library_paths})
+  elseif(MSVC)
+    if(CMAKE_LIBTOOL)
+      set(BUNDLE_TOOL ${CMAKE_LIBTOOL})
+    else()
+      find_program(BUNDLE_TOOL lib HINTS "${CMAKE_CXX_COMPILER}/..")
+      if(NOT BUNDLE_TOOL)
+        message(FATAL_ERROR "Cannot locate lib.exe to bundle libraries")
+      endif()
+    endif()
+    set(BUNDLE_COMMAND ${BUNDLE_TOOL} /NOLOGO /OUT:${output_lib_path}
+                       ${all_library_paths})
   elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|GNU|Intel|IntelLLVM)$")
     set(ar_script_path ${CMAKE_BINARY_DIR}/${ARG_NAME}.ar)
@@ -140,18 +155,6 @@ function(arrow_create_merged_static_lib output_target)
     endif()
 
     set(BUNDLE_COMMAND ${ar_tool} -M < ${ar_script_path})
-
-  elseif(MSVC)
-    if(CMAKE_LIBTOOL)
-      set(BUNDLE_TOOL ${CMAKE_LIBTOOL})
-    else()
-      find_program(BUNDLE_TOOL lib HINTS "${CMAKE_CXX_COMPILER}/..")
-      if(NOT BUNDLE_TOOL)
-        message(FATAL_ERROR "Cannot locate lib.exe to bundle libraries")
-      endif()
-    endif()
-    set(BUNDLE_COMMAND ${BUNDLE_TOOL} /NOLOGO /OUT:${output_lib_path}
-                       ${all_library_paths})
   else()
     message(FATAL_ERROR "Unknown bundle scenario!")
   endif()
diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 755887314d110..a7bf9e59f8aa2 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -364,7 +364,8 @@ takes precedence over ccache if a storage backend is configured" ON) set(ARROW_JEMALLOC_DESCRIPTION "Build the Arrow jemalloc-based allocator") if(WIN32 - OR "${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD" + OR CMAKE_SYSTEM_NAME STREQUAL "FreeBSD" + OR CMAKE_SYSTEM_PROCESSOR MATCHES "aarch|ARM|arm" OR NOT ARROW_ENABLE_THREADING) # jemalloc is not supported on Windows. # # We need to sync this with jemalloc's configure.ac. jemalloc must # be built with --disable-libdl on FreeBSD. Because lazy-lock feature # is required on FreeBSD. Lazy-lock feature requires libdl. # + # jemalloc may have a problem on ARM. 
+ # See also: https://github.com/apache/arrow/issues/44342 + # # jemalloc requires thread. define_option(ARROW_JEMALLOC ${ARROW_JEMALLOC_DESCRIPTION} OFF) else() @@ -636,6 +640,11 @@ Always OFF if building binaries" OFF) "Compiler flags to append when pre-compiling Gandiva operations" "") + #---------------------------------------------------------------------- + set_option_category("Cross compiling") + + define_option_string(ARROW_GRPC_CPP_PLUGIN "grpc_cpp_plugin path to be used" "") + #---------------------------------------------------------------------- set_option_category("Advanced developer") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index db151b4e0f44b..0b215b5b25c62 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -4223,6 +4223,14 @@ if(ARROW_WITH_GRPC) target_link_libraries(gRPC::grpc++ INTERFACE gRPC::grpc_asan_suppressed) endif() endif() + + if(ARROW_GRPC_CPP_PLUGIN) + if(NOT TARGET gRPC::grpc_cpp_plugin) + add_executable(gRPC::grpc_cpp_plugin IMPORTED) + endif() + set_target_properties(gRPC::grpc_cpp_plugin PROPERTIES IMPORTED_LOCATION + ${ARROW_GRPC_CPP_PLUGIN}) + endif() endif() # ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index c911f0f4e9481..5f6b568460afe 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -460,7 +460,7 @@ endif() set(ARROW_VENDORED_SRCS vendored/base64.cpp - vendored/datetime/tz.cpp + vendored/datetime.cpp vendored/double-conversion/bignum-dtoa.cc vendored/double-conversion/bignum.cc vendored/double-conversion/cached-powers.cc @@ -488,7 +488,7 @@ set(ARROW_VENDORED_SRCS if(APPLE) list(APPEND ARROW_VENDORED_SRCS vendored/datetime/ios.mm) endif() -set_source_files_properties(vendored/datetime/tz.cpp +set_source_files_properties(vendored/datetime.cpp PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) arrow_add_object_library(ARROW_VENDORED ${ARROW_VENDORED_SRCS}) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 1851ef9122274..d0e5b6d3c0edf 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -43,7 +43,7 @@ namespace arrow { // VarLengthListLikeBuilder template -class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { +class VarLengthListLikeBuilder : public ArrayBuilder { public: using TypeClass = TYPE; using offset_type = typename TypeClass::offset_type; @@ -261,7 +261,7 @@ class ARROW_EXPORT VarLengthListLikeBuilder : public ArrayBuilder { // ListBuilder / LargeListBuilder template -class ARROW_EXPORT BaseListBuilder : public VarLengthListLikeBuilder { +class BaseListBuilder : public VarLengthListLikeBuilder { private: using BASE = VarLengthListLikeBuilder; @@ -401,7 +401,7 @@ class ARROW_EXPORT LargeListBuilder : public BaseListBuilder { // ListViewBuilder / LargeListViewBuilder template -class ARROW_EXPORT BaseListViewBuilder : public VarLengthListLikeBuilder { +class BaseListViewBuilder : public VarLengthListLikeBuilder { private: using BASE = VarLengthListLikeBuilder; diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index bda6b17810299..ca74ffa06c820 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -26,7 +26,7 @@ #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow::internal { +namespace arrow { namespace 
{ template @@ -167,4 +167,4 @@ void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_i logical_index_vec, out_chunk_location_vec, chunk_hint); } -} // namespace arrow::internal +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index 4a5e27c05361f..ab0e753d0040e 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -27,12 +27,12 @@ #include "arrow/type_fwd.h" #include "arrow/util/macros.h" -namespace arrow::internal { +namespace arrow { -struct ChunkResolver; +class ChunkResolver; template -struct TypedChunkLocation { +struct ARROW_EXPORT TypedChunkLocation { /// \brief Index of the chunk in the array of chunks /// /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used @@ -41,7 +41,7 @@ struct TypedChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is UNDEFINED if chunk_index >= chunks.size() + /// The value is UNDEFINED if `chunk_index >= chunks.size()` IndexType index_in_chunk = 0; TypedChunkLocation() = default; @@ -61,7 +61,7 @@ using ChunkLocation = TypedChunkLocation; /// \brief An utility that incrementally resolves logical indices into /// physical indices in a chunked array. -struct ARROW_EXPORT ChunkResolver { +class ARROW_EXPORT ChunkResolver { private: /// \brief Array containing `chunks.size() + 1` offsets. /// @@ -76,14 +76,11 @@ struct ARROW_EXPORT ChunkResolver { public: explicit ChunkResolver(const ArrayVector& chunks) noexcept; + explicit ChunkResolver(const std::vector& chunks) noexcept; + explicit ChunkResolver(const RecordBatchVector& batches) noexcept; - /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. - /// - /// The first offset must be 0 and the last offset must be the logical length of the - /// chunked array. Each offset before the last represents the starting logical index of - /// the corresponding chunk. explicit ChunkResolver(std::vector offsets) noexcept : offsets_(std::move(offsets)), cached_chunk_(0) { #ifndef NDEBUG @@ -115,11 +112,11 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation Resolve(int64_t index) const { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); @@ -133,13 +130,13 @@ struct ARROW_EXPORT ChunkResolver { /// The returned ChunkLocation contains the chunk index and the within-chunk index /// equivalent to the logical index. /// - /// \pre index >= 0 - /// \post location.chunk_index in [0, chunks.size()] + /// \pre `index >= 0` + /// \post `location.chunk_index` in `[0, chunks.size()]` /// \param index The logical index to resolve /// \param hint ChunkLocation{} or the last ChunkLocation returned by /// this ChunkResolver. 
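For readers following the `ChunkResolver` changes: a hedged usage sketch, not part of the diff, built only from the constructor and method signatures visible in this header, which this PR moves from `arrow::internal` into the public `arrow` namespace:

```cpp
// Sketch: resolving increasing logical indices against a chunked array.
#include <cstdint>

#include <arrow/chunk_resolver.h>
#include <arrow/chunked_array.h>

void VisitAll(const arrow::ChunkedArray& chunked) {
  arrow::ChunkResolver resolver(chunked.chunks());
  arrow::ChunkLocation loc;  // default-constructed: chunk 0, offset 0
  for (int64_t i = 0; i < chunked.length(); ++i) {
    // The previous location bounds the binary search when the logical
    // indices are non-decreasing, as documented for ResolveWithHint.
    loc = resolver.ResolveWithHint(i, loc);
    // loc.chunk_index selects the chunk; loc.index_in_chunk the slot in it.
  }
}
```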
/// \return ChunkLocation with a valid chunk_index if index is within - /// bounds, or with chunk_index == chunks.size() if logical index is + /// bounds, or with `chunk_index == chunks.size()` if logical index is /// `>= chunked_array.length()`. inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); @@ -281,4 +278,13 @@ struct ARROW_EXPORT ChunkResolver { } }; -} // namespace arrow::internal +// Explicitly instantiate template base struct, for DLL linking on Windows +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +template struct TypedChunkLocation; +} // namespace arrow diff --git a/cpp/src/arrow/chunk_resolver_benchmark.cc b/cpp/src/arrow/chunk_resolver_benchmark.cc index 0756de3fbe930..a6f539a444bbc 100644 --- a/cpp/src/arrow/chunk_resolver_benchmark.cc +++ b/cpp/src/arrow/chunk_resolver_benchmark.cc @@ -28,9 +28,6 @@ namespace arrow { -using internal::ChunkResolver; -using internal::TypedChunkLocation; - namespace { int64_t constexpr kChunkedArrayLength = std::numeric_limits::max(); diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index c65b6cb6e227f..02bcd0f9026bc 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -199,7 +199,7 @@ class ARROW_EXPORT ChunkedArray { private: template friend class ::arrow::stl::ChunkedArrayIterator; - internal::ChunkResolver chunk_resolver_; + ChunkResolver chunk_resolver_; ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray); }; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index f98dde689c237..b3944fd1b1927 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -36,10 +36,6 @@ namespace arrow { -using internal::ChunkLocation; -using internal::ChunkResolver; -using internal::TypedChunkLocation; - class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { diff --git a/cpp/src/arrow/compute/kernels/chunked_internal.h b/cpp/src/arrow/compute/kernels/chunked_internal.h index 2b72e0ab3109e..f7cb615f3ed81 100644 --- a/cpp/src/arrow/compute/kernels/chunked_internal.h +++ b/cpp/src/arrow/compute/kernels/chunked_internal.h @@ -52,7 +52,7 @@ struct ResolvedChunk { class ChunkedArrayResolver { private: - ::arrow::internal::ChunkResolver resolver_; + ChunkResolver resolver_; std::vector chunks_; public: diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index 8766ca3baac96..395ed86a06b4a 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -24,7 +24,6 @@ namespace arrow { using internal::checked_cast; -using internal::ChunkLocation; namespace compute { namespace internal { @@ -852,7 +851,7 @@ class TableSorter { const RecordBatchVector batches_; const SortOptions& options_; const NullPlacement null_placement_; - const ::arrow::internal::ChunkResolver left_resolver_, right_resolver_; + const ::arrow::ChunkResolver left_resolver_, right_resolver_; const std::vector sort_keys_; uint64_t* indices_begin_; uint64_t* indices_end_; diff --git a/cpp/src/arrow/compute/kernels/vector_sort_internal.h b/cpp/src/arrow/compute/kernels/vector_sort_internal.h index 564afb8c087d2..bee7f838a05da 100644 --- 
a/cpp/src/arrow/compute/kernels/vector_sort_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_sort_internal.h @@ -749,9 +749,9 @@ struct ResolvedTableSortKey { order(order), null_count(null_count) {} - using LocationType = ::arrow::internal::ChunkLocation; + using LocationType = ::arrow::ChunkLocation; - ResolvedChunk GetChunk(::arrow::internal::ChunkLocation loc) const { + ResolvedChunk GetChunk(::arrow::ChunkLocation loc) const { return {chunks[loc.chunk_index], loc.index_in_chunk}; } diff --git a/cpp/src/arrow/compute/row/row_encoder_internal.h b/cpp/src/arrow/compute/row/row_encoder_internal.h index 4d6cc34af2342..c3275283d5a66 100644 --- a/cpp/src/arrow/compute/row/row_encoder_internal.h +++ b/cpp/src/arrow/compute/row/row_encoder_internal.h @@ -137,7 +137,7 @@ struct ARROW_EXPORT DictionaryKeyEncoder : FixedWidthKeyEncoder { }; template -struct ARROW_EXPORT VarLengthKeyEncoder : KeyEncoder { +struct VarLengthKeyEncoder : KeyEncoder { using Offset = typename T::offset_type; void AddLength(const ExecValue& data, int64_t batch_length, int32_t* lengths) override { diff --git a/cpp/src/arrow/engine/substrait/serde.cc b/cpp/src/arrow/engine/substrait/serde.cc index 16d2ace4ac0d7..6b4c05a3b1dd5 100644 --- a/cpp/src/arrow/engine/substrait/serde.cc +++ b/cpp/src/arrow/engine/substrait/serde.cc @@ -56,7 +56,7 @@ Status ParseFromBufferImpl(const Buffer& buf, const std::string& full_name, if (message->ParseFromZeroCopyStream(&buf_stream)) { return Status::OK(); } - return Status::IOError("ParseFromZeroCopyStream failed for ", full_name); + return Status::Invalid("ParseFromZeroCopyStream failed for ", full_name); } template diff --git a/cpp/src/arrow/flight/integration_tests/test_integration.cc b/cpp/src/arrow/flight/integration_tests/test_integration.cc index da6fcf81eb737..f38076822c778 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -1026,6 +1027,131 @@ class AppMetadataFlightInfoEndpointScenario : public Scenario { } }; +/// \brief The server used for testing do_exchange +class DoExchangeServer : public FlightServerBase { + public: + DoExchangeServer() : FlightServerBase() {} + + Status DoExchange(const ServerCallContext& context, + std::unique_ptr reader, + std::unique_ptr writer) override { + if (reader->descriptor().type != FlightDescriptor::DescriptorType::CMD) { + return Status::Invalid("Must provide a command descriptor"); + } + + const std::string& cmd = reader->descriptor().cmd; + if (cmd == "echo") { + return RunEchoExchange(reader, writer); + } else { + return Status::NotImplemented("Command not implemented: ", cmd); + } + } + + private: + static Status RunEchoExchange(std::unique_ptr& reader, + std::unique_ptr& writer) { + FlightStreamChunk chunk; + bool begun = false; + while (true) { + ARROW_ASSIGN_OR_RAISE(chunk, reader->Next()); + if (!chunk.data && !chunk.app_metadata) { + break; + } + if (!begun && chunk.data) { + begun = true; + RETURN_NOT_OK(writer->Begin(chunk.data->schema())); + } + if (chunk.data && chunk.app_metadata) { + RETURN_NOT_OK(writer->WriteWithMetadata(*chunk.data, chunk.app_metadata)); + } else if (chunk.data) { + RETURN_NOT_OK(writer->WriteRecordBatch(*chunk.data)); + } else if (chunk.app_metadata) { + RETURN_NOT_OK(writer->WriteMetadata(chunk.app_metadata)); + } + } + return Status::OK(); + } +}; + +/// \brief The DoExchangeEcho scenario. 
+/// +/// This tests that the client and server can perform a two-way data exchange. +/// +/// The server should echo back any data sent by the client. +class DoExchangeEchoScenario : public Scenario { + Status MakeServer(std::unique_ptr* server, + FlightServerOptions* options) override { + *server = std::make_unique(); + return Status::OK(); + } + + Status MakeClient(FlightClientOptions* options) override { return Status::OK(); } + + Status RunClient(std::unique_ptr client) override { + auto descriptor = FlightDescriptor::Command("echo"); + FlightCallOptions call_options; + + ARROW_ASSIGN_OR_RAISE(auto do_exchange_result, + client->DoExchange(call_options, descriptor)); + std::unique_ptr writer = std::move(do_exchange_result.writer); + std::unique_ptr reader = std::move(do_exchange_result.reader); + + auto schema = arrow::schema({field("x", int32(), false)}); + ARROW_RETURN_NOT_OK(writer->Begin(schema)); + + ARROW_ASSIGN_OR_RAISE(auto builder, + RecordBatchBuilder::Make(schema, arrow::default_memory_pool())); + + for (int batch_idx = 0; batch_idx < 4; ++batch_idx) { + auto int_builder = builder->GetFieldAs(0); + std::vector batch_data(10); + std::iota(batch_data.begin(), batch_data.end(), batch_idx); + ARROW_RETURN_NOT_OK(int_builder->AppendValues(batch_data)); + ARROW_ASSIGN_OR_RAISE(auto record_batch, builder->Flush()); + + std::string app_metadata = std::to_string(batch_idx); + bool write_metadata = batch_idx % 2 == 0; + + if (write_metadata) { + ARROW_RETURN_NOT_OK( + writer->WriteWithMetadata(*record_batch, Buffer::FromString(app_metadata))); + } else { + ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch)); + } + + ARROW_ASSIGN_OR_RAISE(auto read_result, reader->Next()); + if (read_result.data == nullptr) { + return Status::Invalid("Received null data"); + } + if (!read_result.data->Equals(*record_batch)) { + return Status::Invalid("Read data doesn't match expected data for batch ", + std::to_string(batch_idx), ".\n", "Expected:\n", + record_batch->ToString(), "Actual:\n", + read_result.data->ToString()); + } + + if (write_metadata) { + if (read_result.app_metadata == nullptr) { + return Status::Invalid("Received null app metadata"); + } + if (read_result.app_metadata->ToString() != app_metadata) { + return Status::Invalid("Read metadata doesn't match expected for batch ", + std::to_string(batch_idx), ".\n", "Expected:\n", + app_metadata, "\nActual:\n", + read_result.app_metadata->ToString()); + } + } else if (read_result.app_metadata != nullptr) { + return Status::Invalid("Expected no app metadata but received non-null metadata"); + } + } + + ARROW_RETURN_NOT_OK(writer->DoneWriting()); + ARROW_RETURN_NOT_OK(writer->Close()); + + return Status::OK(); + } +}; + /// \brief Schema to be returned for mocking the statement/prepared statement results. /// /// Must be the same across all languages. @@ -2283,6 +2409,9 @@ Status GetScenario(const std::string& scenario_name, std::shared_ptr* } else if (scenario_name == "app_metadata_flight_info_endpoint") { *out = std::make_shared(); return Status::OK(); + } else if (scenario_name == "do_exchange:echo") { + *out = std::make_shared(); + return Status::OK(); } else if (scenario_name == "flight_sql") { *out = std::make_shared(); return Status::OK(); diff --git a/cpp/src/arrow/io/concurrency.h b/cpp/src/arrow/io/concurrency.h index 43ceb8debcecb..35c2aac6a7e15 100644 --- a/cpp/src/arrow/io/concurrency.h +++ b/cpp/src/arrow/io/concurrency.h @@ -89,7 +89,7 @@ class ARROW_EXPORT SharedExclusiveChecker { // wrappers between those two classes. 
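The `ARROW_EXPORT` removals here (and in `scalar.h`, `type.h`, `basic_decimal.h`, and `builder_nested.h` elsewhere in this diff) all follow one pattern: class templates are compiled into every translation unit that instantiates them, so only explicit instantiations need DLL export, which is what `chunk_resolver.h` above now adds. A sketch with illustrative names, not Arrow code:

```cpp
// The template itself carries no export attribute; its code is emitted
// in whatever translation unit instantiates it.
template <typename T>
class WrapperLike {
 public:
  explicit WrapperLike(T value) : value_(value) {}
  T Get() const { return value_; }

 private:
  T value_;
};

// Only a concrete instantiation that must cross a shared-library boundary
// gets exported, from exactly one translation unit, e.g.:
// template class ARROW_EXPORT WrapperLike<int64_t>;
```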
template -class ARROW_EXPORT InputStreamConcurrencyWrapper : public InputStream { +class InputStreamConcurrencyWrapper : public InputStream { public: Status Close() final { auto guard = lock_.exclusive_guard(); @@ -159,7 +159,7 @@ class ARROW_EXPORT InputStreamConcurrencyWrapper : public InputStream { }; template -class ARROW_EXPORT RandomAccessFileConcurrencyWrapper : public RandomAccessFile { +class RandomAccessFileConcurrencyWrapper : public RandomAccessFile { public: Status Close() final { auto guard = lock_.exclusive_guard(); diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 7a273c46c1991..7ef37301203bc 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -134,7 +134,7 @@ namespace internal { constexpr auto kScalarScratchSpaceSize = sizeof(int64_t) * 2; template -struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { +struct ArraySpanFillFromScalarScratchSpace { // 16 bytes of scratch space to enable ArraySpan to be a view onto any // Scalar- including binary scalars where we need to create a buffer // that looks like two 32-bit or 64-bit offsets. @@ -163,7 +163,7 @@ struct ARROW_EXPORT PrimitiveScalarBase : public Scalar { }; template -struct ARROW_EXPORT PrimitiveScalar : public PrimitiveScalarBase { +struct PrimitiveScalar : public PrimitiveScalarBase { using PrimitiveScalarBase::PrimitiveScalarBase; using TypeClass = T; using ValueType = CType; @@ -464,7 +464,7 @@ struct ARROW_EXPORT Date64Scalar : public DateScalar { }; template -struct ARROW_EXPORT TimeScalar : public TemporalScalar { +struct TimeScalar : public TemporalScalar { using TemporalScalar::TemporalScalar; TimeScalar(typename TemporalScalar::ValueType value, TimeUnit::type unit) @@ -543,7 +543,7 @@ struct ARROW_EXPORT DurationScalar : public TemporalScalar { }; template -struct ARROW_EXPORT DecimalScalar : public internal::PrimitiveScalarBase { +struct DecimalScalar : public internal::PrimitiveScalarBase { using internal::PrimitiveScalarBase::PrimitiveScalarBase; using TypeClass = TYPE_CLASS; using ValueType = VALUE_TYPE; diff --git a/cpp/src/arrow/stl_iterator.h b/cpp/src/arrow/stl_iterator.h index 5f2acfb071b29..577066cba0fcd 100644 --- a/cpp/src/arrow/stl_iterator.h +++ b/cpp/src/arrow/stl_iterator.h @@ -237,7 +237,7 @@ class ChunkedArrayIterator { } private: - arrow::internal::ChunkLocation GetChunkLocation(int64_t index) const { + arrow::ChunkLocation GetChunkLocation(int64_t index) const { assert(chunked_array_); return chunked_array_->chunk_resolver_.Resolve(index); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3d7786bd37e09..53207bb9da8a6 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -548,7 +548,7 @@ ARROW_EXPORT void PrintTo(const Field& field, std::ostream* os); namespace detail { template -class ARROW_EXPORT CTypeImpl : public BASE { +class CTypeImpl : public BASE { public: static constexpr Type::type type_id = TYPE_ID; using c_type = C_TYPE; diff --git a/cpp/src/arrow/util/basic_decimal.h b/cpp/src/arrow/util/basic_decimal.h index fac40a46da8f6..9c1f2e479c712 100644 --- a/cpp/src/arrow/util/basic_decimal.h +++ b/cpp/src/arrow/util/basic_decimal.h @@ -40,7 +40,7 @@ enum class DecimalStatus { }; template -class ARROW_EXPORT GenericBasicDecimal { +class GenericBasicDecimal { protected: struct LittleEndianArrayTag {}; diff --git a/cpp/src/arrow/util/ubsan.h b/cpp/src/arrow/util/ubsan.h index 900d8011dfd69..2308ee33519ca 100644 --- a/cpp/src/arrow/util/ubsan.h +++ b/cpp/src/arrow/util/ubsan.h @@ -63,7 +63,7 @@ inline std::enable_if_t, T> 
SafeLoadAs( template inline std::enable_if_t, T> SafeLoad(const T* unaligned) { std::remove_const_t ret; - std::memcpy(&ret, unaligned, sizeof(T)); + std::memcpy(&ret, static_cast(unaligned), sizeof(T)); return ret; } @@ -73,7 +73,7 @@ inline std::enable_if_t && U> SafeCopy(T value) { std::remove_const_t ret; - std::memcpy(&ret, &value, sizeof(T)); + std::memcpy(&ret, static_cast(&value), sizeof(T)); return ret; } diff --git a/cpp/src/arrow/vendored/datetime.cpp b/cpp/src/arrow/vendored/datetime.cpp new file mode 100644 index 0000000000000..0f0bd12c7e160 --- /dev/null +++ b/cpp/src/arrow/vendored/datetime.cpp @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "datetime/visibility.h" +#include "datetime/tz.cpp" diff --git a/cpp/src/arrow/vendored/datetime.h b/cpp/src/arrow/vendored/datetime.h index e437cdcbc2dae..aea31ebe77f9e 100644 --- a/cpp/src/arrow/vendored/datetime.h +++ b/cpp/src/arrow/vendored/datetime.h @@ -17,10 +17,11 @@ #pragma once -#include "arrow/vendored/datetime/date.h" // IWYU pragma: export -#include "arrow/vendored/datetime/tz.h" // IWYU pragma: export +#include "arrow/vendored/datetime/visibility.h" // IWYU pragma: export +#include "arrow/vendored/datetime/date.h" // IWYU pragma: export +#include "arrow/vendored/datetime/tz.h" // IWYU pragma: export // Can be defined by date.h. #ifdef NOEXCEPT -#undef NOEXCEPT +# undef NOEXCEPT #endif diff --git a/cpp/src/arrow/vendored/datetime/README.md b/cpp/src/arrow/vendored/datetime/README.md index 5a0993b7b4336..89132d9cba0f5 100644 --- a/cpp/src/arrow/vendored/datetime/README.md +++ b/cpp/src/arrow/vendored/datetime/README.md @@ -17,12 +17,16 @@ copies or substantial portions of the Software. Sources for datetime are adapted from Howard Hinnant's date library (https://github.com/HowardHinnant/date). -Sources are taken from changeset 1ead6715dec030d340a316c927c877a3c4e5a00c +Sources are taken from changeset 5bdb7e6f31fac909c090a46dbd9fea27b6e609a4 of the above project. 
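Because the vendored copy is namespace-wrapped as the change list below describes, Arrow-internal code reaches the library through `arrow_vendored::date`. A hedged sketch (illustrative only; assumes compilation inside the Arrow source tree, where this header lives):

```cpp
// Sketch: calling the vendored date library through its wrapped namespace.
#include <iostream>

#include "arrow/vendored/datetime.h"

int main() {
  namespace vdate = arrow_vendored::date;
  // The API mirrors upstream HowardHinnant/date.
  constexpr vdate::year_month_day ymd{vdate::year{2024}, vdate::month{10},
                                      vdate::day{1}};
  std::cout << static_cast<int>(ymd.year()) << '\n';
  return 0;
}
```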
The following changes are made: - fix internal inclusion paths (from "date/xxx.h" to simply "xxx.h") - enclose the `date` namespace inside the `arrow_vendored` namespace -- include a custom "visibility.h" header from "tz.cpp" for proper DLL - exports on Windows -- disable curl-based database downloading in "tz.h" + +## How to update + +```console +$ cd cpp/src/arrow/vendored/datetime +$ ./update.sh 3.0.3 +``` diff --git a/cpp/src/arrow/vendored/datetime/date.h b/cpp/src/arrow/vendored/datetime/date.h index 75e2624296672..c17d6f3f7aa54 100644 --- a/cpp/src/arrow/vendored/datetime/date.h +++ b/cpp/src/arrow/vendored/datetime/date.h @@ -84,9 +84,7 @@ # pragma warning(disable : 4127) #endif -namespace arrow_vendored -{ -namespace date +namespace arrow_vendored::date { //---------------+ @@ -8234,8 +8232,7 @@ operator<<(std::basic_ostream& os, detail::get_units(typename Period::type{}); } -} // namespace date -} // namespace arrow_vendored +} // namespace arrow_vendored::date #ifdef _MSC_VER # pragma warning(pop) diff --git a/cpp/src/arrow/vendored/datetime/ios.h b/cpp/src/arrow/vendored/datetime/ios.h index acad28d13b558..d018e799a833e 100644 --- a/cpp/src/arrow/vendored/datetime/ios.h +++ b/cpp/src/arrow/vendored/datetime/ios.h @@ -32,9 +32,7 @@ # if TARGET_OS_IPHONE # include - namespace arrow_vendored - { - namespace date + namespace arrow_vendored::date { namespace iOSUtils { @@ -43,8 +41,7 @@ std::string get_current_timezone(); } // namespace iOSUtils - } // namespace date - } // namespace arrow_vendored + } // namespace arrow_vendored::date # endif // TARGET_OS_IPHONE #else // !__APPLE__ diff --git a/cpp/src/arrow/vendored/datetime/ios.mm b/cpp/src/arrow/vendored/datetime/ios.mm index 22b7ce6c30bc2..70ba2adf0ed58 100644 --- a/cpp/src/arrow/vendored/datetime/ios.mm +++ b/cpp/src/arrow/vendored/datetime/ios.mm @@ -47,9 +47,7 @@ #define TAR_SIZE_POSITION 124 #define TAR_SIZE_SIZE 12 -namespace arrow_vendored -{ -namespace date +namespace arrow_vendored::date { namespace iOSUtils { @@ -334,7 +332,6 @@ bool writeFile(const std::string &tzdataPath, const std::string &fileName, } } // namespace iOSUtils -} // namespace date -} // namespace arrow_vendored +} // namespace arrow_vendored::date #endif // TARGET_OS_IPHONE diff --git a/cpp/src/arrow/vendored/datetime/tz.cpp b/cpp/src/arrow/vendored/datetime/tz.cpp index 44c627775f3d7..2cf6c62a84d47 100644 --- a/cpp/src/arrow/vendored/datetime/tz.cpp +++ b/cpp/src/arrow/vendored/datetime/tz.cpp @@ -30,10 +30,6 @@ // been invented (that would involve another several millennia of evolution). // We did not mean to shout. -// NOTE(ARROW): This is required so that symbols are properly exported from the DLL -#include "visibility.h" - - #ifdef _WIN32 // windows.h will be included directly and indirectly (e.g. by curl). // We need to define these macros to prevent windows.h bringing in @@ -97,8 +93,25 @@ #endif #if defined(ANDROID) || defined(__ANDROID__) -#include -#endif +# include +# if USE_OS_TZDB +# define MISSING_LEAP_SECONDS 1 +// from https://android.googlesource.com/platform/bionic/+/master/libc/tzcode/bionic.cpp +static constexpr size_t ANDROID_TIMEZONE_NAME_LENGTH = 40; +struct bionic_tzdata_header_t { + char tzdata_version[12]; + std::int32_t index_offset; + std::int32_t data_offset; + std::int32_t final_offset; +}; +struct index_entry_t { + char buf[ANDROID_TIMEZONE_NAME_LENGTH]; + std::int32_t start; + std::int32_t length; + std::int32_t unused; // Was raw GMT offset; always 0 since tzdata2014f (L). 
+}; +# endif // USE_OS_TZDB +#endif // defined(ANDROID) || defined(__ANDROID__) #if USE_OS_TZDB # include @@ -122,10 +135,13 @@ #include #include -// unistd.h is used on some platforms as part of the means to get +// unistd.h is used on some platforms as part of the the means to get // the current time zone. On Win32 windows.h provides a means to do it. // gcc/mingw supports unistd.h on Win32 but MSVC does not. +#ifdef __ANDROID__ +# define INSTALL . +#endif #ifdef _WIN32 # ifdef WINAPI_FAMILY # include @@ -178,9 +194,9 @@ #ifdef _WIN32 static CONSTDATA char folder_delimiter = '\\'; -#else // !_WIN32 +#elif !defined(ANDROID) && !defined(__ANDROID__) static CONSTDATA char folder_delimiter = '/'; -#endif // !_WIN32 +#endif // !defined(WIN32) && !defined(ANDROID) && !defined(__ANDROID__) #if defined(__GNUC__) && __GNUC__ < 5 // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers @@ -191,20 +207,6 @@ static CONSTDATA char folder_delimiter = '/'; #if !USE_OS_TZDB # ifdef _WIN32 -# ifndef WINRT - -namespace -{ - struct task_mem_deleter - { - void operator()(wchar_t buf[]) - { - if (buf != nullptr) - CoTaskMemFree(buf); - } - }; - using co_task_mem_ptr = std::unique_ptr; -} static std::wstring @@ -235,6 +237,21 @@ convert_utf8_to_utf16(const std::string& s) return out; } +# ifndef WINRT + +namespace +{ + struct task_mem_deleter + { + void operator()(wchar_t buf[]) + { + if (buf != nullptr) + CoTaskMemFree(buf); + } + }; + using co_task_mem_ptr = std::unique_ptr; +} + // We might need to know certain locations even if not using the remote API, // so keep these routines out of that block for now. static @@ -372,7 +389,7 @@ class file_streambuf { # ifdef _WIN32 std::wstring wfilename = convert_utf8_to_utf16(filename); - FILE* file = ::_wfopen(wfilename.c_str(), L"rb"); + FILE* file = ::_wfopen(wfilename.c_str(), L"r"); # else // !_WIN32 FILE* file = ::fopen(filename.c_str(), "rb"); # endif // _WIN32 @@ -388,9 +405,8 @@ class file_streambuf }; #endif // !USE_OS_TZDB -namespace arrow_vendored -{ -namespace date + +namespace arrow_vendored::date { // +---------------------+ // | Begin Configuration | @@ -470,7 +486,18 @@ discover_tz_dir() { struct stat sb; using namespace std; -# ifndef __APPLE__ +# if defined(ANDROID) || defined(__ANDROID__) + CONSTDATA auto tz_dir_default = "/apex/com.android.tzdata/etc/tz"; + CONSTDATA auto tz_dir_fallback = "/system/usr/share/zoneinfo"; + + // Check updatable path first + if(stat(tz_dir_default, &sb) == 0 && S_ISDIR(sb.st_mode)) + return tz_dir_default; + else if(stat(tz_dir_fallback, &sb) == 0 && S_ISDIR(sb.st_mode)) + return tz_dir_fallback; + else + throw runtime_error("discover_tz_dir failed to find zoneinfo\n"); +# elif !defined(__APPLE__) CONSTDATA auto tz_dir_default = "/usr/share/zoneinfo"; CONSTDATA auto tz_dir_buildroot = "/usr/share/zoneinfo/uclibc"; @@ -493,9 +520,10 @@ discover_tz_dir() if (!(lstat(timezone, &sb) == 0 && S_ISLNK(sb.st_mode) && sb.st_size > 0)) throw runtime_error("discover_tz_dir failed\n"); string result; - char rp[PATH_MAX+1] = {}; - if (readlink(timezone, rp, sizeof(rp)-1) > 0) - result = string(rp); + unique_ptr rp(new char[sb.st_size]); + const auto rp_length = readlink(timezone, rp.get(), sb.st_size); + if (rp_length > 0) + result = string(rp.get(), rp_length); // readlink doesn't null-terminate else throw system_error(errno, system_category(), "readlink() failed"); auto i = result.find("zoneinfo"); @@ -527,7 +555,9 @@ get_tz_dir() static_assert(min_year <= max_year, "Configuration error"); #endif +#if 
!defined(ANDROID) && !defined(__ANDROID__) static std::unique_ptr init_tzdb(); +#endif // !defined(ANDROID) && !defined(__ANDROID__) tzdb_list::~tzdb_list() { @@ -586,31 +616,67 @@ get_tzdb_list() return tz_db; } +#if !defined(ANDROID) && !defined(__ANDROID__) +inline +static +char +tolower(char c) +{ + return static_cast(std::tolower(c)); +} + +inline +static +void +tolower(std::string& s) +{ + for (auto& c : s) + c = tolower(c); +} + +inline static std::string -parse3(std::istream& in) +get_alpha_word(std::istream& in) { - std::string r(3, ' '); ws(in); - r[0] = static_cast(in.get()); - r[1] = static_cast(in.get()); - r[2] = static_cast(in.get()); - return r; + std::string s; + while (!in.eof() && std::isalpha(in.peek())) + s.push_back(static_cast(in.get())); + return s; } +#endif // !defined(ANDROID) && !defined(__ANDROID__) +inline +static +bool +is_prefix_of(std::string const& key, std::string const& value) +{ + const size_t size = std::min(key.size(), value.size()); + return key.compare(0, size, value, 0, size) == 0; +} + +#if !defined(ANDROID) && !defined(__ANDROID__) static unsigned parse_month(std::istream& in) { - CONSTDATA char*const month_names[] = - {"Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; - auto s = parse3(in); - auto m = std::find(std::begin(month_names), std::end(month_names), s) - month_names; + static std::string const month_names[] = + {"january", "february", "march", "april", "may", "june", + "july", "august", "september", "october", "november", "december"}; + auto s = get_alpha_word(in); + tolower(s); + auto m = std::find_if(std::begin(month_names), std::end(month_names), + [&s](std::string const& m) + { + return is_prefix_of(s, m); + }) + - month_names; if (m >= std::end(month_names) - std::begin(month_names)) throw std::runtime_error("oops: bad month name: " + s); return static_cast(++m); } +#endif // !defined(ANDROID) && !defined(__ANDROID__) #if !USE_OS_TZDB @@ -822,10 +888,16 @@ static unsigned parse_dow(std::istream& in) { - CONSTDATA char*const dow_names[] = - {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; - auto s = parse3(in); - auto dow = std::find(std::begin(dow_names), std::end(dow_names), s) - dow_names; + static std::string const dow_names[] = + {"sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday"}; + auto s = get_alpha_word(in); + tolower(s); + auto dow = std::find_if(std::begin(dow_names), std::end(dow_names), + [&s](std::string const& dow) + { + return is_prefix_of(s, dow); + }) + - dow_names; if (dow >= std::end(dow_names) - std::begin(dow_names)) throw std::runtime_error("oops: bad dow name: " + s); return static_cast(dow); @@ -875,7 +947,7 @@ parse_signed_time(std::istream& in) detail::MonthDayTime::MonthDayTime(local_seconds tp, tz timezone) : zone_(timezone) { - using namespace date; + using namespace arrow_vendored::date; const auto dp = date::floor(tp); const auto hms = make_time(tp - dp); const auto ymd = year_month_day(dp); @@ -969,7 +1041,7 @@ sys_seconds detail::MonthDayTime::to_sys(date::year y, std::chrono::seconds offset, std::chrono::seconds save) const { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; auto until_utc = to_time_point(y); if (zone_ == tz::standard) @@ -1004,7 +1076,7 @@ date::sys_days detail::MonthDayTime::to_sys_days(date::year y) const { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; switch (type_) { case month_day: @@ -1038,7 +1110,7 @@ void 
detail::MonthDayTime::canonicalize(date::year y) { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; switch (type_) { case month_day: @@ -1076,7 +1148,7 @@ detail::MonthDayTime::canonicalize(date::year y) std::istream& detail::operator>>(std::istream& is, MonthDayTime& x) { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; assert(((std::ios::failbit | std::ios::badbit) & is.exceptions()) == (std::ios::failbit | std::ios::badbit)); @@ -1086,7 +1158,7 @@ detail::operator>>(std::istream& is, MonthDayTime& x) auto m = parse_month(is); if (!is.eof() && ws(is) && !is.eof() && is.peek() != '#') { - if (is.peek() == 'l') + if (tolower(is.peek()) == 'l') { for (int i = 0; i < 4; ++i) is.get(); @@ -1212,7 +1284,7 @@ detail::Rule::Rule(const std::string& s) { try { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; std::istringstream in(s); in.exceptions(std::ios::failbit | std::ios::badbit); @@ -1357,7 +1429,7 @@ detail::operator<(const std::string& x, const Rule& y) std::ostream& detail::operator<<(std::ostream& os, const Rule& r) { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; detail::save_ostream _(os); os.fill(' '); @@ -1416,7 +1488,7 @@ detail::Rule::overlaps(const Rule& x, const Rule& y) void detail::Rule::split(std::vector& rules, std::size_t i, std::size_t k, std::size_t& e) { - using namespace date; + using namespace arrow_vendored::date; using difference_type = std::vector::iterator::difference_type; // rules[i].starting_year_ <= rules[k].starting_year_ && // rules[i].ending_year_ >= rules[k].starting_year_ && @@ -1555,7 +1627,7 @@ static std::pair find_previous_rule(const Rule* r, date::year y) { - using namespace date; + using namespace arrow_vendored::date; auto const& rules = get_tzdb().rules; if (y == r->starting_year()) { @@ -1591,7 +1663,7 @@ static std::pair find_next_rule(const Rule* first_rule, const Rule* last_rule, const Rule* r, date::year y) { - using namespace date; + using namespace arrow_vendored::date; if (y == r->ending_year()) { if (r == last_rule-1) @@ -1622,7 +1694,7 @@ static std::pair find_next_rule(const Rule* r, date::year y) { - using namespace date; + using namespace arrow_vendored::date; auto const& rules = get_tzdb().rules; if (y == r->ending_year()) { @@ -1671,7 +1743,7 @@ find_rule_for_zone(const std::pair& eqr, assert(eqr.second != nullptr); using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; auto r = eqr.first; auto ry = r->starting_year(); auto prev_save = minutes{0}; @@ -1697,7 +1769,7 @@ find_rule_for_zone(const std::pair& eqr, const local_seconds& tp_loc) { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; auto r = eqr.first; auto ry = r->starting_year(); auto prev_save = minutes{0}; @@ -1737,7 +1809,7 @@ find_rule(const std::pair& first_rule, const std::string& initial_abbrev) { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; auto r = first_rule.first; auto ry = first_rule.second; sys_info x{sys_days(year::min()/min_day), sys_days(year::max()/max_day), @@ -2152,6 +2224,9 @@ time_zone::load_data(std::istream& inf, void time_zone::init_impl() { +#if defined(ANDROID) || defined(__ANDROID__) + return; +#endif // defined(ANDROID) || defined(__ANDROID__) using namespace std; using namespace std::chrono; auto name = get_tz_dir() + ('/' + name_); @@ -2313,6 
+2388,86 @@ time_zone::get_info_impl(local_seconds tp) const return i; } +#if defined(ANDROID) || defined(__ANDROID__) +void +time_zone::parse_from_android_tzdata(std::ifstream& inf, const std::size_t off) +{ + using namespace std; + using namespace std::chrono; + if (!inf.is_open()) + throw std::runtime_error{"Unable to open tzdata"}; + std::size_t restorepos = inf.tellg(); + inf.seekg(off, inf.beg); + load_header(inf); + auto v = load_version(inf); + std::int32_t tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt, + tzh_timecnt, tzh_typecnt, tzh_charcnt; + skip_reserve(inf); + load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt, + tzh_timecnt, tzh_typecnt, tzh_charcnt); + if (v == 0) + { + load_data(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt, tzh_charcnt); + } + else + { +#if !defined(NDEBUG) + inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt + + tzh_ttisstdcnt + tzh_ttisgmtcnt); + load_header(inf); + auto v2 = load_version(inf); + assert(v == v2); + skip_reserve(inf); +#else // defined(NDEBUG) + inf.ignore((4+1)*tzh_timecnt + 6*tzh_typecnt + tzh_charcnt + 8*tzh_leapcnt + + tzh_ttisstdcnt + tzh_ttisgmtcnt + (4+1+15)); +#endif // defined(NDEBUG) + load_counts(inf, tzh_ttisgmtcnt, tzh_ttisstdcnt, tzh_leapcnt, + tzh_timecnt, tzh_typecnt, tzh_charcnt); + load_data(inf, tzh_leapcnt, tzh_timecnt, tzh_typecnt, tzh_charcnt); + } +#if !MISSING_LEAP_SECONDS + if (tzh_leapcnt > 0) + { + auto& leap_seconds = get_tzdb_list().front().leap_seconds; + auto itr = leap_seconds.begin(); + auto l = itr->date(); + seconds leap_count{0}; + for (auto t = std::upper_bound(transitions_.begin(), transitions_.end(), l, + [](const sys_seconds& x, const transition& ct) + { + return x < ct.timepoint; + }); + t != transitions_.end(); ++t) + { + while (t->timepoint >= l) + { + ++leap_count; + if (++itr == leap_seconds.end()) + l = sys_days(max_year/max_day); + else + l = itr->date() + leap_count; + } + t->timepoint -= leap_count; + } + } +#endif // !MISSING_LEAP_SECONDS + auto b = transitions_.begin(); + auto i = transitions_.end(); + if (i != b) + { + for (--i; i != b; --i) + { + if (i->info->offset == i[-1].info->offset && + i->info->abbrev == i[-1].info->abbrev && + i->info->is_dst == i[-1].info->is_dst) + i = transitions_.erase(i); + } + } + inf.seekg(restorepos, inf.beg); +} +#endif // defined(ANDROID) || defined(__ANDROID__) + std::ostream& operator<<(std::ostream& os, const time_zone& z) { @@ -2346,7 +2501,7 @@ time_zone::time_zone(const std::string& s, detail::undocumented) { try { - using namespace date; + using namespace arrow_vendored::date; std::istringstream in(s); in.exceptions(std::ios::failbit | std::ios::badbit); std::string word; @@ -2416,7 +2571,7 @@ time_zone::add(const std::string& s) void time_zone::parse_info(std::istream& in) { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; zonelets_.emplace_back(); auto& zonelet = zonelets_.back(); @@ -2449,7 +2604,7 @@ void time_zone::adjust_infos(const std::vector& rules) { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; const zonelet* prev_zonelet = nullptr; for (auto& z : zonelets_) { @@ -2620,7 +2775,7 @@ sys_info time_zone::get_info_impl(sys_seconds tp, int tz_int) const { using namespace std::chrono; - using namespace date; + using namespace arrow_vendored::date; tz timezone = static_cast(tz_int); assert(timezone != tz::standard); auto y = year_month_day(floor(tp)).year(); @@ -2682,7 +2837,7 @@ time_zone::get_info_impl(sys_seconds tp, 
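
parse_from_android_tzdata() reads one zone's TZif blob out of bionic's single concatenated tzdata file: it saves the stream position, seeks to the zone's offset, reuses the existing load_header()/load_version()/load_counts()/load_data() machinery (including the skip over the 32-bit section for version-2+ files), subtracts accumulated leap seconds from the decoded transitions, erases adjacent transitions whose info is identical, and restores the stream position. The surrounding file layout, sketched in Python with struct; the exact index-entry layout is an assumption for illustration, not something the diff spells out:

    import struct

    def read_android_tzdata_index(path):  # path = get_tz_dir() + "/tzdata" in the C++ code
        with open(path, "rb") as f:
            # Assumed header: 12-byte version string ("tzdata2024a\0") plus three
            # int32 offsets; ">" (big-endian) is what maybe_reverse_bytes() would
            # undo on little-endian hosts.
            version, index_off, data_off, final_off = struct.unpack(">12s3i", f.read(24))
            if not version.startswith(b"tzdata"):
                raise ValueError("Malformed tzdata - invalid magic!")
            entry = struct.Struct(">40s3i")  # zone name + start/length/unused (assumed)
            index_size = data_off - index_off
            if index_size % entry.size:
                raise ValueError("Malformed tzdata - index size malformed!")
            f.seek(index_off)
            zones = []
            for _ in range(index_size // entry.size):
                name, start, length, _unused = entry.unpack(f.read(entry.size))
                # Each zone's TZif data lives at data_off + start, matching
                # parse_from_android_tzdata(in, hdr.data_offset + index_entry.start).
                zones.append((name.rstrip(b"\0").decode(), data_off + start, length))
            # db->version strips the "tzdata" prefix, e.g. "2024a".
            return version.rstrip(b"\0").decode()[6:], zones
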
int tz_int) const std::ostream& operator<<(std::ostream& os, const time_zone& z) { - using namespace date; + using namespace arrow_vendored::date; using namespace std::chrono; detail::save_ostream _(os); os.fill(' '); @@ -2744,14 +2899,14 @@ operator<<(std::ostream& os, const leap_second& x) #if USE_OS_TZDB +#if !defined(ANDROID) && !defined(__ANDROID__) static std::string get_version() { - using namespace std; - auto path = get_tz_dir() + string("/+VERSION"); - ifstream in{path}; - string version; + auto path = get_tz_dir() + std::string("/+VERSION"); + std::ifstream in{path}; + std::string version; if (in) { in >> version; @@ -2786,7 +2941,8 @@ find_read_and_leap_seconds() iss.exceptions(std::ios::failbit | std::ios::badbit); std::string word; iss >> word; - if (word == "Leap") + tolower(word); + if (is_prefix_of(word, "leap")) { int y, m, d; iss >> y; @@ -2847,6 +3003,7 @@ find_read_and_leap_seconds() #endif return {}; } +#endif // !defined(ANDROID) && !defined(__ANDROID__) static std::unique_ptr @@ -2854,6 +3011,38 @@ init_tzdb() { std::unique_ptr db(new tzdb); +#if defined(ANDROID) || defined(__ANDROID__) + auto path = get_tz_dir() + std::string("/tzdata"); + std::ifstream in{path}; + if (!in) + throw std::runtime_error("Can not open " + path); + bionic_tzdata_header_t hdr{}; + in.read(reinterpret_cast(&hdr), sizeof(bionic_tzdata_header_t)); + if (!is_prefix_of(hdr.tzdata_version, "tzdata") || hdr.tzdata_version[11] != 0) + throw std::runtime_error("Malformed tzdata - invalid magic!"); + maybe_reverse_bytes(hdr.index_offset); + maybe_reverse_bytes(hdr.data_offset); + maybe_reverse_bytes(hdr.final_offset); + if (hdr.index_offset > hdr.data_offset) + throw std::runtime_error("Malformed tzdata - hdr.index_offset > hdr.data_offset!"); + const size_t index_size = hdr.data_offset - hdr.index_offset; + if ((index_size % sizeof(index_entry_t)) != 0) + throw std::runtime_error("Malformed tzdata - index size malformed!"); + //Iterate through zone index + index_entry_t index_entry{}; + for (size_t idx = 0; idx < index_size; idx += sizeof(index_entry_t)) { + in.read(reinterpret_cast(&index_entry), sizeof(index_entry_t)); + maybe_reverse_bytes(index_entry.start); + maybe_reverse_bytes(index_entry.length); + time_zone timezone{std::string(index_entry.buf), + detail::undocumented{}}; + timezone.parse_from_android_tzdata(in, hdr.data_offset + index_entry.start); + db->zones.emplace_back(std::move(timezone)); + } + db->zones.shrink_to_fit(); + std::sort(db->zones.begin(), db->zones.end()); + db->version = std::string(hdr.tzdata_version).replace(0, 6, ""); +#else //Iterate through folders std::queue subfolders; subfolders.emplace(get_tz_dir()); @@ -2878,6 +3067,7 @@ init_tzdb() strcmp(d->d_name, "version") == 0 || strcmp(d->d_name, "zone.tab") == 0 || strcmp(d->d_name, "zone1970.tab") == 0 || + strcmp(d->d_name, "zonenow.tab") == 0 || strcmp(d->d_name, "tzdata.zi") == 0 || strcmp(d->d_name, "leapseconds") == 0 || strcmp(d->d_name, "leap-seconds.list") == 0 ) @@ -2905,6 +3095,7 @@ init_tzdb() std::sort(db->zones.begin(), db->zones.end()); db->leap_seconds = find_read_and_leap_seconds(); db->version = get_version(); +#endif // defined(ANDROID) || defined(__ANDROID__) return db; } @@ -2914,7 +3105,7 @@ init_tzdb() time_zone_link::time_zone_link(const std::string& s) { - using namespace date; + using namespace arrow_vendored::date; std::istringstream in(s); in.exceptions(std::ios::failbit | std::ios::badbit); std::string word; @@ -2924,7 +3115,7 @@ time_zone_link::time_zone_link(const std::string& s) 
std::ostream& operator<<(std::ostream& os, const time_zone_link& x) { - using namespace date; + using namespace arrow_vendored::date; detail::save_ostream _(os); os.fill(' '); os.flags(std::ios::dec | std::ios::left); @@ -2936,7 +3127,7 @@ operator<<(std::ostream& os, const time_zone_link& x) leap_second::leap_second(const std::string& s, detail::undocumented) { - using namespace date; + using namespace arrow_vendored::date; std::istringstream in(s); in.exceptions(std::ios::failbit | std::ios::badbit); std::string word; @@ -3568,7 +3759,7 @@ static std::unique_ptr init_tzdb() { - using namespace date; + using namespace arrow_vendored::date; const std::string install = get_install(); const std::string path = install + folder_delimiter; std::string line; @@ -3647,22 +3838,23 @@ init_tzdb() std::istringstream in(line); std::string word; in >> word; - if (word == "Rule") + tolower(word); + if (is_prefix_of(word, "rule")) { db->rules.push_back(Rule(line)); continue_zone = false; } - else if (word == "Link") + else if (is_prefix_of(word, "link")) { db->links.push_back(time_zone_link(line)); continue_zone = false; } - else if (word == "Leap") + else if (is_prefix_of(word, "leap")) { db->leap_seconds.push_back(leap_second(line, detail::undocumented{})); continue_zone = false; } - else if (word == "Zone") + else if (is_prefix_of(word, "zone")) { db->zones.push_back(time_zone(line, detail::undocumented{})); continue_zone = true; @@ -3991,10 +4183,12 @@ bool sniff_realpath(const char* timezone) { using namespace std; - char rp[PATH_MAX+1] = {}; - if (realpath(timezone, rp) == nullptr) + unique_ptr rp(realpath(timezone, nullptr), free); + if (rp.get() == nullptr) throw system_error(errno, system_category(), "realpath() failed"); - auto result = extract_tz_name(rp); + auto result = extract_tz_name(rp.get()); + if (result.find("posix") == 0) + return false; return result != "posixrules"; } @@ -4021,18 +4215,24 @@ tzdb::current_zone() const { using namespace std; static const bool use_realpath = sniff_realpath(timezone); - char rp[PATH_MAX+1] = {}; if (use_realpath) { - if (realpath(timezone, rp) == nullptr) + unique_ptr rp(realpath(timezone, nullptr), free); + if (rp.get() == nullptr) throw system_error(errno, system_category(), "realpath() failed"); + return locate_zone(extract_tz_name(rp.get())); } else { - if (readlink(timezone, rp, sizeof(rp)-1) <= 0) + // +1 because st_size doesn't include the '\0' terminator + const auto rp_size = sb.st_size + 1; + unique_ptr rp(new char[rp_size]); + const auto rp_length = readlink(timezone, rp.get(), rp_size); + if (rp_length <= 0) throw system_error(errno, system_category(), "readlink() failed"); + rp.get()[rp_length] = '\0'; // readlink doesn't null-terminate + return locate_zone(extract_tz_name(rp.get())); } - return locate_zone(extract_tz_name(rp)); } } // On embedded systems e.g. buildroot with uclibc the timezone is linked @@ -4051,9 +4251,10 @@ tzdb::current_zone() const if (lstat(timezone, &sb) == 0 && S_ISLNK(sb.st_mode) && sb.st_size > 0) { using namespace std; string result; - char rp[PATH_MAX+1] = {}; - if (readlink(timezone, rp, sizeof(rp)-1) > 0) - result = string(rp); + unique_ptr rp(new char[sb.st_size]); + const auto rp_length = readlink(timezone, rp.get(), sb.st_size); + if (rp_length > 0) + result = string(rp.get(), rp_length); // readlink doesn't null-terminate else throw system_error(errno, system_category(), "readlink() failed"); @@ -4135,6 +4336,25 @@ tzdb::current_zone() const } // Fall through to try other means. 
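
Both current_zone() paths stop using fixed char rp[PATH_MAX+1] buffers: realpath(timezone, nullptr) now lets libc size the allocation, and readlink() writes into a buffer sized from lstat()'s st_size (plus one byte for the terminator readlink never writes). sniff_realpath() also now treats any posix* resolution as unusable and falls back to readlink(). The same name extraction in Python, where the runtime does the buffer management:

    import os

    def current_zone_name(link="/etc/localtime"):
        # os.path.realpath() sizes its buffer internally and returns str, so
        # none of the PATH_MAX/terminator bookkeeping is needed here.
        target = os.path.realpath(link)
        # Assumes the conventional ".../zoneinfo/Area/Location" layout.
        _, sep, name = target.partition("zoneinfo/")
        if not sep or name.startswith("posix"):
            raise RuntimeError("Could not get current timezone")
        return name
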
} + // On OpenWRT we need to check /etc/config/system + // It will have a line with the following structure + // ... + // option zoneName 'Europe/Berlin' + // ... + { + std::ifstream timezone_file("/etc/config/system"); + if (timezone_file.is_open()) + { + for(std::string result; std::getline(timezone_file, result);) { + std::string findStr = "option zoneName '"; + size_t startPos = result.find(findStr); + if (startPos != std::string::npos) { + size_t endPos = result.find("'", startPos + findStr.size()); + return locate_zone(result.substr(startPos + findStr.size(), endPos - startPos - findStr.size())); + } + } + } + } throw std::runtime_error("Could not get current timezone"); } @@ -4146,8 +4366,7 @@ current_zone() return get_tzdb().current_zone(); } -} // namespace date -} // namespace arrow_vendored +} // namespace arrow_vendored::date #if defined(__GNUC__) && __GNUC__ < 5 # pragma GCC diagnostic pop diff --git a/cpp/src/arrow/vendored/datetime/tz.h b/cpp/src/arrow/vendored/datetime/tz.h index df6d1a851ac9d..61ab3df106db0 100644 --- a/cpp/src/arrow/vendored/datetime/tz.h +++ b/cpp/src/arrow/vendored/datetime/tz.h @@ -43,19 +43,13 @@ // required. On Windows, the names are never "Standard" so mapping is always required. // Technically any OS may use the mapping process but currently only Windows does use it. -// NOTE(ARROW): If this is not set, then the library will attempt to -// use libcurl to obtain a timezone database, and we probably do not want this. -#ifndef _WIN32 -#define USE_OS_TZDB 1 -#endif - #ifndef USE_OS_TZDB # define USE_OS_TZDB 0 #endif #ifndef HAS_REMOTE_API # if USE_OS_TZDB == 0 -# ifdef _WIN32 +# if defined _WIN32 || defined __ANDROID__ # define HAS_REMOTE_API 0 # else # define HAS_REMOTE_API 1 @@ -140,13 +134,18 @@ static_assert(HAS_REMOTE_API == 0 ? 
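
current_zone() also gains an OpenWRT fallback just before the final failure, added above: it scans the UCI config /etc/config/system for a line of the form option zoneName 'Europe/Berlin'. The extraction, restated in Python:

    def openwrt_zone_name(path="/etc/config/system"):
        needle = "option zoneName '"
        with open(path) as f:
            for line in f:
                start = line.find(needle)
                if start != -1:
                    start += len(needle)
                    return line[start:line.find("'", start)]
        return None  # caller falls through to "Could not get current timezone"
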
AUTO_DOWNLOAD == 0 : true, # endif #endif -namespace arrow_vendored -{ -namespace date +namespace arrow_vendored::date { enum class choose {earliest, latest}; +#if defined(BUILD_TZ_LIB) +# if defined(ANDROID) || defined(__ANDROID__) +struct tzdb; +static std::unique_ptr init_tzdb(); +# endif // defined(ANDROID) || defined(__ANDROID__) +#endif // defined(BUILD_TZ_LIB) + namespace detail { struct undocumented; @@ -829,6 +828,12 @@ class time_zone #if !USE_OS_TZDB DATE_API void add(const std::string& s); +#else +# if defined(BUILD_TZ_LIB) +# if defined(ANDROID) || defined(__ANDROID__) + friend std::unique_ptr init_tzdb(); +# endif // defined(ANDROID) || defined(__ANDROID__) +# endif // defined(BUILD_TZ_LIB) #endif // !USE_OS_TZDB private: @@ -852,6 +857,9 @@ class time_zone DATE_API void load_data(std::istream& inf, std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt, std::int32_t tzh_typecnt, std::int32_t tzh_charcnt); +# if defined(ANDROID) || defined(__ANDROID__) + void parse_from_android_tzdata(std::ifstream& inf, const std::size_t off); +# endif // defined(ANDROID) || defined(__ANDROID__) #else // !USE_OS_TZDB DATE_API sys_info get_info_impl(sys_seconds tp, int tz_int) const; DATE_API void adjust_infos(const std::vector& rules); @@ -1198,11 +1206,11 @@ struct tzdb #endif // defined(_MSC_VER) && (_MSC_VER < 1900) #if HAS_STRING_VIEW - const time_zone* locate_zone(std::string_view tz_name) const; + DATE_API const time_zone* locate_zone(std::string_view tz_name) const; #else - const time_zone* locate_zone(const std::string& tz_name) const; + DATE_API const time_zone* locate_zone(const std::string& tz_name) const; #endif - const time_zone* current_zone() const; + DATE_API const time_zone* current_zone() const; }; using TZ_DB = tzdb; @@ -1217,9 +1225,9 @@ class tzdb_list std::atomic head_{nullptr}; public: - ~tzdb_list(); + DATE_API ~tzdb_list(); tzdb_list() = default; - tzdb_list(tzdb_list&& x) NOEXCEPT; + DATE_API tzdb_list(tzdb_list&& x) NOEXCEPT; const tzdb& front() const NOEXCEPT {return *head_;} tzdb& front() NOEXCEPT {return *head_;} @@ -1232,7 +1240,7 @@ class tzdb_list const_iterator cbegin() const NOEXCEPT; const_iterator cend() const NOEXCEPT; - const_iterator erase_after(const_iterator p) NOEXCEPT; + DATE_API const_iterator erase_after(const_iterator p) NOEXCEPT; struct undocumented_helper; private: @@ -2795,7 +2803,6 @@ to_gps_time(const tai_time& t) return gps_clock::from_utc(tai_clock::to_utc(t)); } -} // namespace date -} // namespace arrow_vendored +} // namespace arrow_vendored::date #endif // TZ_H diff --git a/cpp/src/arrow/vendored/datetime/tz_private.h b/cpp/src/arrow/vendored/datetime/tz_private.h index a6bb8fd30a0c7..1d7f858971106 100644 --- a/cpp/src/arrow/vendored/datetime/tz_private.h +++ b/cpp/src/arrow/vendored/datetime/tz_private.h @@ -34,9 +34,7 @@ #include #endif -namespace arrow_vendored -{ -namespace date +namespace arrow_vendored::date { namespace detail @@ -308,8 +306,7 @@ struct transition } // namespace detail -} // namespace date -} // namespace arrow_vendored +} // namespace arrow_vendored::date #if defined(_MSC_VER) && (_MSC_VER < 1900) #include "tz.h" diff --git a/cpp/src/arrow/vendored/datetime/update.sh b/cpp/src/arrow/vendored/datetime/update.sh new file mode 100755 index 0000000000000..b4580c042608e --- /dev/null +++ b/cpp/src/arrow/vendored/datetime/update.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
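
In tz.h, locate_zone(), current_zone() and the tzdb_list special members pick up DATE_API so they are exported from the Arrow shared library, and the USE_OS_TZDB default moves into visibility.h (still forced on for non-Windows, so the library never reaches for libcurl). From Python, these entry points can be exercised with a timezone-aware compute call, assuming a build where the C++ implementation resolves zone names through this vendored database:

    import pyarrow as pa
    import pyarrow.compute as pc

    # assume_timezone must look "Europe/Berlin" up in the timezone database,
    # which (on non-Windows builds) is served by the vendored locate_zone().
    arr = pa.array([0], pa.timestamp("s"))
    print(pc.assume_timezone(arr, timezone="Europe/Berlin"))
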
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -eux + +source_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 VERSION" + echo " e.g.: $0 3.0.3" + exit 1 +fi + +version="$1" + +pushd "${source_dir}" +rm -rf date +git clone \ + --branch "v${version}" \ + --depth 1 \ + https://github.com/HowardHinnant/date.git +commit_id=$(git -C date log -1 --format=format:%H) +mv date/include/date/date.h ./ +mv date/include/date/ios.h ./ +mv date/include/date/tz.h ./ +mv date/include/date/tz_private.h ./ +mv date/src/* ./ +rm -rf date +sed -i.bak -E \ + -e 's/namespace date/namespace arrow_vendored::date/g' \ + -e 's,include "date/,include ",g' \ + *.{cpp,h,mm} +sed -i.bak -E \ + -e "s/changeset [0-9a-f]+/changeset ${commit_id}/g" \ + README.md +rm *.bak +popd diff --git a/cpp/src/arrow/vendored/datetime/visibility.h b/cpp/src/arrow/vendored/datetime/visibility.h index ae031238d85ac..780c00d70bd9f 100644 --- a/cpp/src/arrow/vendored/datetime/visibility.h +++ b/cpp/src/arrow/vendored/datetime/visibility.h @@ -17,10 +17,14 @@ #pragma once +#ifndef _WIN32 +# define USE_OS_TZDB 1 +#endif + #if defined(ARROW_STATIC) // intentially empty #elif defined(ARROW_EXPORTING) -#define DATE_BUILD_DLL +# define DATE_BUILD_DLL #else -#define DATE_USE_DLL +# define DATE_USE_DLL #endif diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index c2bc7fc02797e..e1427e25fb666 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -63,7 +63,7 @@ add_gandiva_test(precompiled-test time.cc timestamp_arithmetic.cc ../cast_time.cc - ../../arrow/vendored/datetime/tz.cpp + ../../arrow/vendored/datetime.cpp hash_test.cc hash.cc string_ops_test.cc diff --git a/csharp/Apache.Arrow.sln b/csharp/Apache.Arrow.sln index 7e7f7c6331e88..0e569de1d6c8f 100644 --- a/csharp/Apache.Arrow.sln +++ b/csharp/Apache.Arrow.sln @@ -27,6 +27,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Apache.Arrow.Flight.Sql.Tes EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Apache.Arrow.Flight.Sql", "src\Apache.Arrow.Flight.Sql\Apache.Arrow.Flight.Sql.csproj", "{2ADE087A-B424-4895-8CC5-10170D10BA62}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Apache.Arrow.Flight.IntegrationTest", "test\Apache.Arrow.Flight.IntegrationTest\Apache.Arrow.Flight.IntegrationTest.csproj", "{7E66CBB4-D921-41E7-A98A-7C6DEA521696}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -81,6 +83,10 @@ Global {2ADE087A-B424-4895-8CC5-10170D10BA62}.Debug|Any CPU.Build.0 = Debug|Any CPU {2ADE087A-B424-4895-8CC5-10170D10BA62}.Release|Any CPU.ActiveCfg = Release|Any CPU {2ADE087A-B424-4895-8CC5-10170D10BA62}.Release|Any CPU.Build.0 = Release|Any CPU + {7E66CBB4-D921-41E7-A98A-7C6DEA521696}.Debug|Any CPU.ActiveCfg 
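
The new update.sh above makes re-vendoring reproducible: clone the tagged upstream release, move the headers and sources into place, then rewrite every namespace date into the C++17 nested form arrow_vendored::date (which is why the two-level namespace arrow_vendored { namespace date { wrappers disappear throughout this diff) and flatten the date/ include paths. Its sed step, restated in Python for clarity:

    import pathlib
    import re

    for path in pathlib.Path(".").glob("*.cpp"):  # the script also covers *.h and *.mm
        text = path.read_text()
        # Matches both `namespace date` definitions and `using namespace date;`.
        text = re.sub(r"namespace date", "namespace arrow_vendored::date", text)
        text = text.replace('include "date/', 'include "')
        path.write_text(text)
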
= Debug|Any CPU + {7E66CBB4-D921-41E7-A98A-7C6DEA521696}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7E66CBB4-D921-41E7-A98A-7C6DEA521696}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7E66CBB4-D921-41E7-A98A-7C6DEA521696}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index ec438fde843f4..6bd8b638c7ea0 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 4c8af5b0bbaa3..f15e127a30557 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamWriter.cs b/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamWriter.cs index f76f08224541f..7a8a6fd677c68 100644 --- a/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamWriter.cs +++ b/csharp/src/Apache.Arrow.Flight/FlightRecordBatchStreamWriter.cs @@ -64,7 +64,7 @@ protected virtual void Dispose(bool disposing) { if (!_disposed) { - _flightDataStream.Dispose(); + _flightDataStream?.Dispose(); _disposed = true; } } diff --git a/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs b/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs index 99876bf769dc7..22d0bd84fef77 100644 --- a/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs +++ b/csharp/src/Apache.Arrow.Flight/Internal/RecordBatchReaderImplementation.cs @@ -69,42 +69,43 @@ public override void ReadSchema() public override async ValueTask ReadSchemaAsync(CancellationToken cancellationToken) { - if (HasReadSchema) + while (!HasReadSchema) { - return; - } - - var moveNextResult = await _flightDataStream.MoveNext(cancellationToken).ConfigureAwait(false); - - if (!moveNextResult) - { - throw new Exception("No records or schema in this flight"); - } + var moveNextResult = await _flightDataStream.MoveNext(cancellationToken).ConfigureAwait(false); + if (!moveNextResult) + { + throw new Exception("No records or schema in this flight"); + } - //AppMetadata will never be null, but length 0 if empty - //Those are skipped - if(_flightDataStream.Current.AppMetadata.Length > 0) - { - _applicationMetadatas.Add(_flightDataStream.Current.AppMetadata); - } + if (_flightDescriptor == null && _flightDataStream.Current.FlightDescriptor != null) + { + _flightDescriptor = new FlightDescriptor(_flightDataStream.Current.FlightDescriptor); + } - var header = _flightDataStream.Current.DataHeader.Memory; - Message message = Message.GetRootAsMessage( - ArrowReaderImplementation.CreateByteBuffer(header)); + // AppMetadata will never be null, but length 0 if empty + // Those are skipped + if(_flightDataStream.Current.AppMetadata.Length > 0) + { + _applicationMetadatas.Add(_flightDataStream.Current.AppMetadata); + } + var header = _flightDataStream.Current.DataHeader.Memory; + if (header.IsEmpty) + { + // Clients may send a first message with a descriptor only and no schema + continue; + } - if(_flightDataStream.Current.FlightDescriptor != null) - { - _flightDescriptor = new 
FlightDescriptor(_flightDataStream.Current.FlightDescriptor); - } + Message message = Message.GetRootAsMessage(ArrowReaderImplementation.CreateByteBuffer(header)); - switch (message.HeaderType) - { - case MessageHeader.Schema: - _schema = FlightMessageSerializer.DecodeSchema(message.ByteBuffer); - break; - default: - throw new Exception($"Expected schema as the first message, but got: {message.HeaderType.ToString()}"); + switch (message.HeaderType) + { + case MessageHeader.Schema: + _schema = FlightMessageSerializer.DecodeSchema(message.ByteBuffer); + break; + default: + throw new Exception($"Expected schema as the first message, but got: {message.HeaderType.ToString()}"); + } } } diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/Apache.Arrow.Flight.IntegrationTest.csproj b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Apache.Arrow.Flight.IntegrationTest.csproj new file mode 100644 index 0000000000000..34030621b4bde --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Apache.Arrow.Flight.IntegrationTest.csproj @@ -0,0 +1,18 @@ + + + + + Exe + net8.0 + Apache.Arrow.Flight.IntegrationTest + + + + + + + + + + + diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightClientCommand.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightClientCommand.cs new file mode 100644 index 0000000000000..a26bcf07eca49 --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightClientCommand.cs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using System.Threading.Tasks; +using Apache.Arrow.Flight.IntegrationTest.Scenarios; + +namespace Apache.Arrow.Flight.IntegrationTest; + +public class FlightClientCommand +{ + private readonly int _port; + private readonly string _scenario; + private readonly FileInfo _jsonFileInfo; + + public FlightClientCommand(int port, string scenario, FileInfo jsonFileInfo) + { + _port = port; + _scenario = scenario; + _jsonFileInfo = jsonFileInfo; + } + + public async Task Execute() + { + IScenario scenario = _scenario switch + { + null => new JsonTestScenario(_jsonFileInfo), + "do_exchange:echo" => new DoExchangeEchoScenario(), + _ => throw new NotSupportedException($"Scenario '{_scenario}' is not supported"), + }; + + await scenario.RunClient(_port).ConfigureAwait(false); + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightServerCommand.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightServerCommand.cs new file mode 100644 index 0000000000000..38f14b789974d --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/FlightServerCommand.cs @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
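
The ReadSchemaAsync() rewrite above turns a single read into a loop: a client may legally open a stream with a FlightData message that carries only the flight descriptor and no data header, so the reader now records the descriptor and any app metadata, skips empty headers, and keeps reading until the Schema message arrives. The control flow as a hedged Python sketch over a hypothetical message iterator (decode_schema() stands in for FlightMessageSerializer.DecodeSchema):

    def read_schema(messages):
        descriptor, metadatas = None, []
        for msg in messages:
            if descriptor is None and msg.descriptor is not None:
                descriptor = msg.descriptor
            if msg.app_metadata:            # zero-length metadata is skipped
                metadatas.append(msg.app_metadata)
            if not msg.data_header:         # descriptor-only first message
                continue
            # The C# code additionally rejects any non-Schema header here.
            return decode_schema(msg.data_header), descriptor, metadatas
        raise RuntimeError("No records or schema in this flight")
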
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Net; +using System.Threading.Tasks; +using Apache.Arrow.Flight.IntegrationTest.Scenarios; +using Apache.Arrow.Flight.Server; +using Apache.Arrow.Flight.TestWeb; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Hosting.Server; +using Microsoft.AspNetCore.Hosting.Server.Features; +using Microsoft.AspNetCore.Server.Kestrel.Core; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Console; + +namespace Apache.Arrow.Flight.IntegrationTest; + +public class FlightServerCommand +{ + private readonly string _scenario; + + public FlightServerCommand(string scenario) + { + _scenario = scenario; + } + + public async Task Execute() + { + IScenario scenario = _scenario switch + { + null => null, + "do_exchange:echo" => new DoExchangeEchoScenario(), + _ => throw new NotSupportedException($"Scenario {_scenario} is not supported") + }; + + var host = Host.CreateDefaultBuilder() + .ConfigureWebHostDefaults(webBuilder => + { + webBuilder + .ConfigureKestrel(options => + { + options.Listen(IPEndPoint.Parse("127.0.0.1:0"), l => l.Protocols = HttpProtocols.Http2); + }) + .ConfigureServices(services => + { + if (scenario == null) + { + // Use the TestFlightServer for JSON based integration tests + services.AddGrpc().AddFlightServer(); + services.AddSingleton(new FlightStore()); + } + else + { + // Use a scenario-specific server implementation + services.AddGrpc().Services.AddScoped(_ => scenario.MakeServer()); + } + + // The integration tests rely on the port being written to the first line of stdout, + // so send all logging to stderr. + services.Configure( + o => o.LogToStandardErrorThreshold = LogLevel.Debug); + + }) + .UseStartup(); + }) + .Build(); + + await host.StartAsync().ConfigureAwait(false); + + var addresses = host.Services.GetService().Features.Get().Addresses; + foreach (var address in addresses) + { + Console.WriteLine($"Server listening on {address}"); + } + + await host.WaitForShutdownAsync().ConfigureAwait(false); + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/GrpcResolver.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/GrpcResolver.cs new file mode 100644 index 0000000000000..44b1075e7abf2 --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/GrpcResolver.cs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using Grpc.Net.Client.Balancer; + +namespace Apache.Arrow.Flight.IntegrationTest; + +/// +/// The Grpc.Net.Client library doesn't know how to handle the "grpc+tcp" scheme used by Arrow Flight. +/// This ResolverFactory passes these through to the standard Static Resolver used for the http scheme. +/// +public class GrpcTcpResolverFactory : ResolverFactory +{ + public override string Name => "grpc+tcp"; + + public override Resolver Create(ResolverOptions options) + { + return new StaticResolverFactory( + uri => new[] { new BalancerAddress(options.Address.Host, options.Address.Port) }) + .Create(options); + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/IScenario.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/IScenario.cs new file mode 100644 index 0000000000000..41ed631f33528 --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/IScenario.cs @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Threading.Tasks; +using Apache.Arrow.Flight.Server; + +namespace Apache.Arrow.Flight.IntegrationTest; + +/// +/// A Flight integration test scenario +/// +internal interface IScenario +{ + /// + /// Create a FlightServer instance to run the scenario + /// + FlightServer MakeServer(); + + /// + /// Run the scenario using a Flight client + /// + Task RunClient(int serverPort); +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/Program.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Program.cs new file mode 100644 index 0000000000000..24d39de28a731 --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Program.cs @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
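
GrpcTcpResolverFactory exists because Grpc.Net.Client picks a resolver by URI scheme and does not know Flight's grpc+tcp://; the factory simply forwards to the static resolver used for http. PyArrow's Flight client accepts the scheme natively, which is the behavior being matched:

    import pyarrow.flight as flight

    # pyarrow.flight maps "grpc+tcp" onto a plain insecure gRPC channel out
    # of the box; the port here is illustrative.
    client = flight.connect("grpc+tcp://localhost:31337")
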
+// See the License for the specific language governing permissions and +// limitations under the License. + +using System.CommandLine; +using System.IO; +using System.Threading.Tasks; + +namespace Apache.Arrow.Flight.IntegrationTest; + +public static class Program +{ + public static async Task Main(string[] args) + { + var portOption = new Option( + new[] { "--port", "-p" }, + description: "Port the Flight server is listening on"); + var scenarioOption = new Option( + new[] { "--scenario", "-s" }, + "The name of the scenario to run"); + var pathOption = new Option( + new[] { "--path", "-j" }, + "Path to a JSON file of test data"); + + var rootCommand = new RootCommand( + "Integration test application for Apache.Arrow .NET Flight."); + + var clientCommand = new Command("client", "Run the Flight client") + { + portOption, + scenarioOption, + pathOption, + }; + rootCommand.AddCommand(clientCommand); + + clientCommand.SetHandler(async (port, scenario, jsonFile) => + { + var command = new FlightClientCommand(port, scenario, jsonFile); + await command.Execute().ConfigureAwait(false); + }, portOption, scenarioOption, pathOption); + + var serverCommand = new Command("server", "Run the Flight server") + { + scenarioOption, + }; + rootCommand.AddCommand(serverCommand); + + serverCommand.SetHandler(async scenario => + { + var command = new FlightServerCommand(scenario); + await command.Execute().ConfigureAwait(false); + }, scenarioOption); + + return await rootCommand.InvokeAsync(args).ConfigureAwait(false); + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/DoExchangeEchoScenario.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/DoExchangeEchoScenario.cs new file mode 100644 index 0000000000000..6e9b2696bbfb9 --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/DoExchangeEchoScenario.cs @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Apache.Arrow.Flight.Client; +using Apache.Arrow.Flight.Server; +using Google.Protobuf; +using Grpc.Core; +using Grpc.Net.Client; +using Grpc.Net.Client.Balancer; +using Microsoft.Extensions.DependencyInjection; + +namespace Apache.Arrow.Flight.IntegrationTest.Scenarios; + +internal class DoExchangeServer : FlightServer +{ + public override async Task DoExchange( + FlightServerRecordBatchStreamReader requestStream, + FlightServerRecordBatchStreamWriter responseStream, + ServerCallContext context) + { + var descriptor = await requestStream.FlightDescriptor; + var command = descriptor.Command?.ToStringUtf8(); + if (command != "echo") + { + throw new Exception($"Unsupported command: '{command}'"); + } + + while (await requestStream.MoveNext()) + { + await responseStream.WriteAsync( + requestStream.Current, requestStream.ApplicationMetadata.FirstOrDefault()); + } + } +} + +internal class DoExchangeEchoScenario : IScenario +{ + public FlightServer MakeServer() => new DoExchangeServer(); + + public async Task RunClient(int serverPort) + { + var services = new ServiceCollection(); + services.AddSingleton(new GrpcTcpResolverFactory()); + var serviceProvider = services.BuildServiceProvider(); + + var address = $"grpc+tcp://localhost:{serverPort}"; + using var channel = GrpcChannel.ForAddress( + address, + new GrpcChannelOptions + { + ServiceProvider = serviceProvider, + Credentials = ChannelCredentials.Insecure + }); + + var client = new FlightClient(channel); + var descriptor = FlightDescriptor.CreateCommandDescriptor("echo"); + using var exchange = client.DoExchange(descriptor); + + using var writer = exchange.RequestStream; + using var reader = exchange.ResponseStream; + + for (var batchIdx = 0; batchIdx < 4; batchIdx++) + { + using var batch = new RecordBatch.Builder() + .Append( + "x", + nullable: false, + array: new Int32Array.Builder().AppendRange(Enumerable.Range(batchIdx, 10)).Build()) + .Build(); + + var expectedMetadata = $"{batchIdx}"; + var writeMetadata = batchIdx % 2 == 0; + if (writeMetadata) + { + await writer.WriteAsync(batch, ByteString.CopyFromUtf8(expectedMetadata)); + } + else + { + await writer.WriteAsync(batch); + } + + if (!await reader.MoveNext(CancellationToken.None)) + { + throw new Exception("Unexpected end of read stream"); + } + + var readMetadata = reader.ApplicationMetadata?.FirstOrDefault()?.ToStringUtf8(); + + if (writeMetadata && readMetadata != expectedMetadata) + { + throw new Exception($"Expected metadata '{expectedMetadata}' but received '{readMetadata}'"); + } + if (!writeMetadata && readMetadata != null) + { + throw new Exception($"Unexpected metadata received: '{readMetadata}'"); + } + } + + await writer.CompleteAsync(); + + if (await reader.MoveNext(CancellationToken.None)) + { + throw new Exception("Expected end of read stream"); + } + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/JsonTestScenario.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/JsonTestScenario.cs new file mode 100644 index 0000000000000..4f7fed74352fc --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Scenarios/JsonTestScenario.cs @@ -0,0 +1,176 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
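
The do_exchange:echo scenario above writes four batches, attaches app metadata only to the even-numbered ones, and requires each batch (and its metadata, when present) to come straight back. A matching client in Python via pyarrow.flight, assuming an echo server on an illustrative port:

    import pyarrow as pa
    import pyarrow.flight as flight

    client = flight.connect("grpc+tcp://localhost:31337")
    writer, reader = client.do_exchange(flight.FlightDescriptor.for_command(b"echo"))

    writer.begin(pa.schema([("x", pa.int32())]))
    for i in range(4):
        batch = pa.record_batch([pa.array(range(i, i + 10), pa.int32())], names=["x"])
        if i % 2 == 0:
            writer.write_with_metadata(batch, pa.py_buffer(str(i).encode()))
        else:
            writer.write_batch(batch)
        chunk = reader.read_chunk()              # server echoes each batch
        assert chunk.data.equals(batch)
        got = chunk.app_metadata.to_pybytes() if chunk.app_metadata else None
        assert got == (str(i).encode() if i % 2 == 0 else None)
    writer.done_writing()
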
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using Apache.Arrow.Flight.Client; +using Apache.Arrow.Flight.Server; +using Apache.Arrow.IntegrationTest; +using Apache.Arrow.Tests; +using Apache.Arrow.Types; +using Google.Protobuf; +using Grpc.Net.Client; +using Grpc.Core; +using Grpc.Net.Client.Balancer; +using Microsoft.Extensions.DependencyInjection; + +namespace Apache.Arrow.Flight.IntegrationTest.Scenarios; + +/// +/// A test scenario defined using a JSON data file +/// +internal class JsonTestScenario : IScenario +{ + private readonly FileInfo _jsonFile; + private readonly ServiceProvider _serviceProvider; + + public JsonTestScenario(FileInfo jsonFile) + { + if (!(jsonFile?.Exists ?? false)) + { + throw new Exception($"Invalid JSON file path '{jsonFile?.FullName}'"); + } + + _jsonFile = jsonFile; + + var services = new ServiceCollection(); + services.AddSingleton(new GrpcTcpResolverFactory()); + _serviceProvider = services.BuildServiceProvider(); + } + + public FlightServer MakeServer() + { + throw new NotImplementedException(); + } + + public async Task RunClient(int serverPort) + { + var address = $"grpc+tcp://localhost:{serverPort}"; + using var channel = GrpcChannel.ForAddress( + address, + new GrpcChannelOptions + { + ServiceProvider = _serviceProvider, + Credentials = ChannelCredentials.Insecure + }); + var client = new FlightClient(channel); + + var descriptor = FlightDescriptor.CreatePathDescriptor(_jsonFile.FullName); + + var jsonFile = await JsonFile.ParseAsync(_jsonFile).ConfigureAwait(false); + var schema = jsonFile.GetSchemaAndDictionaries(out Func dictionaries); + var batches = jsonFile.Batches.Select(batch => batch.ToArrow(schema, dictionaries)).ToArray(); + + // 1. Put the data to the server. + await UploadBatches(client, descriptor, batches).ConfigureAwait(false); + + // 2. Get the ticket for the data. + var info = await client.GetInfo(descriptor).ConfigureAwait(false); + if (info.Endpoints.Count == 0) + { + throw new Exception("No endpoints received"); + } + + // 3. Stream data from the server, comparing individual batches. 
+ foreach (var endpoint in info.Endpoints) + { + var locations = endpoint.Locations.ToArray(); + if (locations.Length == 0) + { + // Can read with existing client + await ConsumeFlightLocation(client, endpoint.Ticket, batches).ConfigureAwait(false); + } + else + { + foreach (var location in locations) + { + using var readChannel = GrpcChannel.ForAddress( + location.Uri, + new GrpcChannelOptions + { + ServiceProvider = _serviceProvider, + Credentials = ChannelCredentials.Insecure + }); + var readClient = new FlightClient(readChannel); + await ConsumeFlightLocation(readClient, endpoint.Ticket, batches).ConfigureAwait(false); + } + } + } + } + + private static async Task UploadBatches(FlightClient client, FlightDescriptor descriptor, RecordBatch[] batches) + { + using var putCall = client.StartPut(descriptor); + using var writer = putCall.RequestStream; + + try + { + var counter = 0; + foreach (var batch in batches) + { + var metadata = $"{counter}"; + + await writer.WriteAsync(batch, ByteString.CopyFromUtf8(metadata)).ConfigureAwait(false); + + // Verify server has acknowledged the write request + await putCall.ResponseStream.MoveNext().ConfigureAwait(false); + var responseString = putCall.ResponseStream.Current.ApplicationMetadata.ToStringUtf8(); + + if (responseString != metadata) + { + throw new Exception($"Response metadata '{responseString}' does not match expected metadata '{metadata}'"); + } + + counter++; + } + } + finally + { + await writer.CompleteAsync().ConfigureAwait(false); + } + + // Drain the response stream to ensure the server has stored the data + var hasMore = await putCall.ResponseStream.MoveNext().ConfigureAwait(false); + if (hasMore) + { + throw new Exception("Expected to have reached the end of the response stream"); + } + } + + private static async Task ConsumeFlightLocation(FlightClient client, FlightTicket ticket, RecordBatch[] batches) + { + using var readStream = client.GetStream(ticket); + var counter = 0; + foreach (var originalBatch in batches) + { + if (!await readStream.ResponseStream.MoveNext().ConfigureAwait(false)) + { + throw new Exception($"Expected {batches.Length} batches but received {counter}"); + } + + var batch = readStream.ResponseStream.Current; + ArrowReaderVerifier.CompareBatches(originalBatch, batch, strictCompare: false); + + counter++; + } + + if (await readStream.ResponseStream.MoveNext().ConfigureAwait(false)) + { + throw new Exception($"Expected to reach the end of the response stream after {batches.Length} batches"); + } + } +} diff --git a/csharp/test/Apache.Arrow.Flight.IntegrationTest/Startup.cs b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Startup.cs new file mode 100644 index 0000000000000..7e29d1997e63f --- /dev/null +++ b/csharp/test/Apache.Arrow.Flight.IntegrationTest/Startup.cs @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
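
JsonTestScenario reproduces the classic JSON-file integration flow: DoPut every batch with its index as app metadata and require an acknowledgement from the server, then GetFlightInfo and re-read each endpoint (dialling any advertised locations), comparing the batches. A hedged Python restatement of the round trip using pyarrow.flight:

    import pyarrow as pa
    import pyarrow.flight as flight

    def upload_and_verify(client, descriptor, batches):
        writer, metadata_reader = client.do_put(descriptor, batches[0].schema)
        for i, batch in enumerate(batches):
            # Attach the batch index as app metadata...
            writer.write_with_metadata(batch, pa.py_buffer(str(i).encode()))
            # ...and require the server to echo it back as the acknowledgement.
            ack = metadata_reader.read()
            assert ack is not None and ack.to_pybytes() == str(i).encode()
        writer.done_writing()

        info = client.get_flight_info(descriptor)
        assert info.endpoints, "No endpoints received"
        expected = pa.Table.from_batches(batches)
        for endpoint in info.endpoints:
            # No locations means "read from the same connection", as in the C# code.
            clients = [client] if not endpoint.locations else [
                flight.connect(location) for location in endpoint.locations]
            for c in clients:
                assert c.do_get(endpoint.ticket).read_all() == expected

The acknowledgement half only works because of the TestFlightServer change just below, which swaps the unconditional FlightPutResult.Empty for an echo of the received metadata; the server-side shape of that, again as a pyarrow sketch:

    class AckingServer(flight.FlightServerBase):
        def do_put(self, context, descriptor, reader, writer):
            while True:
                try:
                    chunk = reader.read_chunk()
                except StopIteration:
                    break                      # client finished writing
                if chunk.app_metadata is not None:
                    writer.write(chunk.app_metadata)  # metadata echo is the ack
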
+// See the License for the specific language governing permissions and +// limitations under the License. + +using Microsoft.AspNetCore.Builder; +using Microsoft.AspNetCore.Hosting; +using Microsoft.AspNetCore.Http; +using Microsoft.Extensions.Hosting; + +namespace Apache.Arrow.Flight.IntegrationTest +{ + public class Startup + { + // This method gets called by the runtime. Use this method to configure the HTTP request pipeline. + public void Configure(IApplicationBuilder app, IWebHostEnvironment env) + { + if (env.IsDevelopment()) + { + app.UseDeveloperExceptionPage(); + } + + app.UseRouting(); + + app.UseEndpoints(endpoints => + { + endpoints.MapFlightEndpoint(); + + endpoints.MapGet("/", async context => + { + await context.Response.WriteAsync("Communication with gRPC endpoints must be made through a gRPC client. To learn how to create a client, visit: https://go.microsoft.com/fwlink/?linkid=2086909"); + }); + }); + } + } +} diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/Startup.cs b/csharp/test/Apache.Arrow.Flight.TestWeb/Startup.cs index 97c1af2f06cb8..68ce378ccd064 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/Startup.cs +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/Startup.cs @@ -13,10 +13,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -using System; -using System.Collections.Generic; -using System.Linq; -using System.Threading.Tasks; using Microsoft.AspNetCore.Builder; using Microsoft.AspNetCore.Hosting; using Microsoft.AspNetCore.Http; diff --git a/csharp/test/Apache.Arrow.Flight.TestWeb/TestFlightServer.cs b/csharp/test/Apache.Arrow.Flight.TestWeb/TestFlightServer.cs index 4a72b73274f1e..46c5460912d8c 100644 --- a/csharp/test/Apache.Arrow.Flight.TestWeb/TestFlightServer.cs +++ b/csharp/test/Apache.Arrow.Flight.TestWeb/TestFlightServer.cs @@ -67,14 +67,16 @@ public override async Task DoPut(FlightServerRecordBatchStreamReader requestStre if(!_flightStore.Flights.TryGetValue(flightDescriptor, out var flightHolder)) { - flightHolder = new FlightHolder(flightDescriptor, await requestStream.Schema, $"http://{context.Host}"); + flightHolder = new FlightHolder(flightDescriptor, await requestStream.Schema, $"grpc+tcp://{context.Host}"); _flightStore.Flights.Add(flightDescriptor, flightHolder); } while (await requestStream.MoveNext()) { - flightHolder.AddBatch(new RecordBatchWithMetadata(requestStream.Current, requestStream.ApplicationMetadata.FirstOrDefault())); - await responseStream.WriteAsync(FlightPutResult.Empty); + var applicationMetadata = requestStream.ApplicationMetadata.FirstOrDefault(); + flightHolder.AddBatch(new RecordBatchWithMetadata(requestStream.Current, applicationMetadata)); + await responseStream.WriteAsync( + applicationMetadata == null ? 
FlightPutResult.Empty : new FlightPutResult(applicationMetadata)); } } diff --git a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs index 0e82673d02240..350762c992769 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs +++ b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs @@ -24,7 +24,6 @@ using Google.Protobuf; using Grpc.Core; using Grpc.Core.Utils; -using Python.Runtime; using Xunit; namespace Apache.Arrow.Flight.Tests diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 9f86d172ddbcf..b4fbbb2d41498 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -25,7 +25,7 @@ import numpy as np from .util import frombytes, tobytes, random_bytes, random_utf8 -from .util import SKIP_C_SCHEMA, SKIP_C_ARRAY +from .util import SKIP_C_SCHEMA, SKIP_C_ARRAY, SKIP_FLIGHT def metadata_key_values(pairs): @@ -1890,7 +1890,10 @@ def _temp_path(): return file_objs = [ - generate_primitive_case([], name='primitive_no_batches'), + generate_primitive_case([], name='primitive_no_batches') + # TODO(https://github.com/apache/arrow/issues/44363) + .skip_format(SKIP_FLIGHT, 'C#'), + generate_primitive_case([17, 20], name='primitive'), generate_primitive_case([0, 0, 0], name='primitive_zerolength'), @@ -1952,20 +1955,25 @@ def _temp_path(): generate_dictionary_case() # TODO(https://github.com/apache/arrow-nanoarrow/issues/622) - .skip_tester('nanoarrow'), + .skip_tester('nanoarrow') + # TODO(https://github.com/apache/arrow/issues/38045) + .skip_format(SKIP_FLIGHT, 'C#'), generate_dictionary_unsigned_case() .skip_tester('nanoarrow') - .skip_tester('Java'), # TODO(ARROW-9377) + .skip_tester('Java') # TODO(ARROW-9377) + # TODO(https://github.com/apache/arrow/issues/38045) + .skip_format(SKIP_FLIGHT, 'C#'), generate_nested_dictionary_case() # TODO(https://github.com/apache/arrow-nanoarrow/issues/622) .skip_tester('nanoarrow') - .skip_tester('Java'), # TODO(ARROW-7779) + .skip_tester('Java') # TODO(ARROW-7779) + # TODO(https://github.com/apache/arrow/issues/38045) + .skip_format(SKIP_FLIGHT, 'C#'), generate_run_end_encoded_case() .skip_tester('C#') - .skip_tester('Java') .skip_tester('JS') # TODO(https://github.com/apache/arrow-nanoarrow/issues/618) .skip_tester('nanoarrow') @@ -1988,7 +1996,9 @@ def _temp_path(): .skip_tester('nanoarrow') # TODO: ensure the extension is registered in the C++ entrypoint .skip_format(SKIP_C_SCHEMA, 'C++') - .skip_format(SKIP_C_ARRAY, 'C++'), + .skip_format(SKIP_C_ARRAY, 'C++') + # TODO(https://github.com/apache/arrow/issues/38045) + .skip_format(SKIP_FLIGHT, 'C#'), ] generated_paths = [] diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index e276738846371..5cba350253065 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -631,10 +631,13 @@ def append_tester(implementation, tester): flight_scenarios = [ Scenario( "auth:basic_proto", - description="Authenticate using the BasicAuth protobuf."), + description="Authenticate using the BasicAuth protobuf.", + skip_testers={"C#"}, + ), Scenario( "middleware", description="Ensure headers are propagated via middleware.", + skip_testers={"C#"}, ), Scenario( "ordered", @@ -666,6 +669,12 @@ def append_tester(implementation, tester): "RenewFlightEndpoint are working as expected."), skip_testers={"JS", "C#", "Rust"}, ), + Scenario( + "do_exchange:echo", + 
description=("Test the do_exchange method by " + "echoing data back to the client."), + skip_testers={"Go", "JS", "Rust"}, + ), Scenario( "location:reuse_connection", description="Ensure arrow-flight-reuse-connection is accepted.", @@ -689,12 +698,12 @@ def append_tester(implementation, tester): Scenario( "flight_sql", description="Ensure Flight SQL protocol is working as expected.", - skip_testers={"Rust"} + skip_testers={"Rust", "C#"} ), Scenario( "flight_sql:extension", description="Ensure Flight SQL extensions work as expected.", - skip_testers={"Rust"} + skip_testers={"Rust", "C#"} ), Scenario( "flight_sql:ingestion", diff --git a/dev/archery/archery/integration/tester_csharp.py b/dev/archery/archery/integration/tester_csharp.py index 02ced0701deaf..50b3499fbf285 100644 --- a/dev/archery/archery/integration/tester_csharp.py +++ b/dev/archery/archery/integration/tester_csharp.py @@ -17,6 +17,7 @@ from contextlib import contextmanager import os +import subprocess from . import cdata from .tester import Tester, CDataExporter, CDataImporter @@ -25,12 +26,20 @@ _ARTIFACTS_PATH = os.path.join(ARROW_ROOT_DEFAULT, "csharp/artifacts") +_BUILD_SUBDIR = "Debug/net8.0" _EXE_PATH = os.path.join(_ARTIFACTS_PATH, "Apache.Arrow.IntegrationTest", - "Debug/net8.0/Apache.Arrow.IntegrationTest", + _BUILD_SUBDIR, + "Apache.Arrow.IntegrationTest", ) +_FLIGHT_EXE_PATH = os.path.join(_ARTIFACTS_PATH, + "Apache.Arrow.Flight.IntegrationTest", + _BUILD_SUBDIR, + "Apache.Arrow.Flight.IntegrationTest", + ) + _clr_loaded = False @@ -44,10 +53,10 @@ def _load_clr(): import clr clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.IntegrationTest/" - f"Debug/net8.0/Apache.Arrow.IntegrationTest.dll") + f"{_BUILD_SUBDIR}/Apache.Arrow.IntegrationTest.dll") clr.AddReference( f"{_ARTIFACTS_PATH}/Apache.Arrow.Tests/" - f"Debug/net8.0/Apache.Arrow.Tests.dll") + f"{_BUILD_SUBDIR}/Apache.Arrow.Tests.dll") from Apache.Arrow.IntegrationTest import CDataInterface CDataInterface.Initialize() @@ -146,6 +155,8 @@ def run_gc(self): class CSharpTester(Tester): PRODUCER = True CONSUMER = True + FLIGHT_SERVER = True + FLIGHT_CLIENT = True C_DATA_SCHEMA_EXPORTER = True C_DATA_SCHEMA_IMPORTER = True C_DATA_ARRAY_EXPORTER = True @@ -192,3 +203,43 @@ def make_c_data_exporter(self): def make_c_data_importer(self): return CSharpCDataImporter(self.debug, self.args) + + def flight_request(self, port, json_path=None, scenario_name=None): + cmd = [_FLIGHT_EXE_PATH, 'client', '--port', f'{port}'] + if json_path: + cmd.extend(['--path', json_path]) + elif scenario_name: + cmd.extend(['--scenario', scenario_name]) + else: + raise TypeError("Must provide one of json_path or scenario_name") + + if self.debug: + log(' '.join(cmd)) + run_cmd(cmd) + + @contextmanager + def flight_server(self, scenario_name=None): + cmd = [_FLIGHT_EXE_PATH, 'server'] + if scenario_name: + cmd.extend(['--scenario', scenario_name]) + if self.debug: + log(' '.join(cmd)) + server = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + try: + output = server.stdout.readline().decode() + if not output.startswith("Server listening on "): + server.kill() + out, err = server.communicate() + raise RuntimeError( + '.NET Flight server did not start properly, ' + 'stdout: \n{}\n\nstderr:\n{}\n'.format( + output + out.decode(), err.decode() + ) + ) + port = int(output.split(':')[-1]) + yield port + finally: + server.kill() + server.wait(5) diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index b3eae212212ab..cc3f5b7cc5251 100755 --- 
a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -90,7 +90,7 @@ if [ ${SOURCE_UPLOAD} -gt 0 ]; then ${sha512_generate} $tarball > ${tarball}.sha512 # Upload signed tarballs to GitHub Release - gh release upload ${tag} ${tarball}.sha256 ${tarball}.sha512 + gh release upload --repo apache/arrow ${tag} ${tarball}.sha256 ${tarball}.sha512 # check out the arrow RC folder svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow tmp diff --git a/dev/release/06-java-upload.sh b/dev/release/06-java-upload.sh index 9d791f8ef7c62..d0fd851da5767 100755 --- a/dev/release/06-java-upload.sh +++ b/dev/release/06-java-upload.sh @@ -107,8 +107,9 @@ for pom in *.pom; do classifiers="" args=() args+=(deploy:deploy-file) - args+=(-Durl=https://repository.apache.org/service/local/staging/deploy/maven2) args+=(-DrepositoryId=apache.releases.https) + args+=(-DretryFailedDeploymentCount=10) + args+=(-Durl=https://repository.apache.org/service/local/staging/deploy/maven2) pom="${PWD}/${pom}" args+=(-DpomFile="${pom}") if [ -f "${base}.jar" ]; then @@ -139,7 +140,7 @@ for pom in *.pom; do args+=(-Dtypes="${types}") args+=(-Dclassifiers="${classifiers}") pushd "${SOURCE_DIR}" - mvn deploy:deploy-file "${args[@]}" + mvn "${args[@]}" popd done diff --git a/dev/release/07-matlab-upload.sh b/dev/release/07-matlab-upload.sh index 803ceadb35eaf..fa3f2f1086717 100755 --- a/dev/release/07-matlab-upload.sh +++ b/dev/release/07-matlab-upload.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash +# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index b06b1c46b8504..4387641741d15 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -532,7 +532,8 @@ def with_retry(max_n_retries, target) OpenSSL::OpenSSLError, SocketError, SystemCallError, - Timeout::Error => error + Timeout::Error, + Error => error n_retries += 1 if n_retries <= max_n_retries $stderr.puts diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index e149c179813a0..dda1d36dc1aeb 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -116,6 +116,7 @@ csharp/src/Apache.Arrow.Flight.AspNetCore/Apache.Arrow.Flight.AspNetCore.csproj csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj csharp/test/Apache.Arrow.Benchmarks/Apache.Arrow.Benchmarks.csproj +csharp/test/Apache.Arrow.Flight.IntegrationTest/Apache.Arrow.Flight.IntegrationTest.csproj csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj csharp/test/Apache.Arrow.Flight.TestWeb/Apache.Arrow.Flight.TestWeb.csproj diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 4e5593525477e..d9f973562aa78 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -185,88 +185,79 @@ test_binary() { test_apt() { show_header "Testing APT packages" - for target in "debian:bookworm" \ - "arm64v8/debian:bookworm" \ - "debian:trixie" \ - "arm64v8/debian:trixie" \ - "ubuntu:focal" \ - "arm64v8/ubuntu:focal" \ - "ubuntu:jammy" \ - "arm64v8/ubuntu:jammy" \ - "ubuntu:noble" \ - "arm64v8/ubuntu:noble"; do \ - case "${target}" in - arm64v8/*) - if [ "$(arch)" = "aarch64" -o -e 
/usr/bin/qemu-aarch64-static ]; then - case "${target}" in - arm64v8/ubuntu:focal) - : # OK - ;; - *) - # qemu-user-static in Ubuntu 20.04 has a crash bug: - # https://bugs.launchpad.net/qemu/+bug/1749393 - continue - ;; - esac - else - continue - fi - ;; - esac - if ! docker run --rm -v "${ARROW_DIR}":/arrow:delegated \ - --security-opt="seccomp=unconfined" \ - "${target}" \ - /arrow/dev/release/verify-apt.sh \ - "${VERSION}" \ - "rc"; then - echo "Failed to verify the APT repository for ${target}" - exit 1 - fi - done + if [ "$(arch)" = "x86_64" ]; then + for target in "debian:bookworm" \ + "debian:trixie" \ + "ubuntu:focal" \ + "ubuntu:jammy" \ + "ubuntu:noble"; do \ + if ! docker run \ + --platform=linux/x86_64 \ + --rm \ + --security-opt="seccomp=unconfined" \ + --volume "${ARROW_DIR}":/arrow:delegated \ + "${target}" \ + /arrow/dev/release/verify-apt.sh \ + "${VERSION}" \ + "rc"; then + echo "Failed to verify the APT repository for ${target} on x86_64" + exit 1 + fi + done + fi + + if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then + for target in "arm64v8/debian:bookworm" \ + "arm64v8/debian:trixie" \ + "arm64v8/ubuntu:focal" \ + "arm64v8/ubuntu:jammy" \ + "arm64v8/ubuntu:noble"; do \ + if ! docker run \ + --platform=linux/arm64 \ + --rm \ + --security-opt="seccomp=unconfined" \ + --volume "${ARROW_DIR}":/arrow:delegated \ + "${target}" \ + /arrow/dev/release/verify-apt.sh \ + "${VERSION}" \ + "rc"; then + echo "Failed to verify the APT repository for ${target} on arm64" + exit 1 + fi + done + fi } test_yum() { show_header "Testing Yum packages" - for target in "almalinux:9" \ - "arm64v8/almalinux:9" \ - "almalinux:8" \ - "arm64v8/almalinux:8" \ - "amazonlinux:2023" \ - "quay.io/centos/centos:stream9" \ - "quay.io/centos/centos:stream8" \ - "centos:7"; do - case "${target}" in - arm64v8/*) - if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then - : # OK - else - continue - fi - ;; - centos:7) - if [ "$(arch)" = "x86_64" ]; then - : # OK - else - continue - fi - ;; - esac - if ! docker run \ - --rm \ - --security-opt="seccomp=unconfined" \ - --volume "${ARROW_DIR}":/arrow:delegated \ - "${target}" \ - /arrow/dev/release/verify-yum.sh \ - "${VERSION}" \ - "rc"; then - echo "Failed to verify the Yum repository for ${target}" - exit 1 - fi - done + if [ "$(arch)" = "x86_64" ]; then + for target in "almalinux:9" \ + "almalinux:8" \ + "amazonlinux:2023" \ + "quay.io/centos/centos:stream9" \ + "quay.io/centos/centos:stream8" \ + "centos:7"; do + if ! docker run \ + --platform linux/x86_64 \ + --rm \ + --security-opt="seccomp=unconfined" \ + --volume "${ARROW_DIR}":/arrow:delegated \ + "${target}" \ + /arrow/dev/release/verify-yum.sh \ + "${VERSION}" \ + "rc"; then + echo "Failed to verify the Yum repository for ${target} on x86_64" + exit 1 + fi + done + fi - if [ "$(arch)" != "aarch64" -a -e /usr/bin/qemu-aarch64-static ]; then - for target in "quay.io/centos/centos:stream9" \ + if [ "$(arch)" = "aarch64" -o -e /usr/bin/qemu-aarch64-static ]; then + for target in "arm64v8/almalinux:9" \ + "arm64v8/almalinux:8" \ + "arm64v8/amazonlinux:2023" \ + "quay.io/centos/centos:stream9" \ "quay.io/centos/centos:stream8"; do if ! 
docker run \ --platform linux/arm64 \ @@ -277,7 +268,7 @@ test_yum() { /arrow/dev/release/verify-yum.sh \ "${VERSION}" \ "rc"; then - echo "Failed to verify the Yum repository for ${target} arm64" + echo "Failed to verify the Yum repository for ${target} on arm64" exit 1 fi done @@ -775,9 +766,7 @@ test_glib() { show_header "Build and test C GLib libraries" # Build and test C GLib - # We can remove '==2.80.5' once https://github.com/conda-forge/glib-feedstock/issues/191 - # is fixed. - maybe_setup_conda glib==2.80.5 gobject-introspection meson ninja ruby + maybe_setup_conda glib gobject-introspection meson ninja ruby maybe_setup_virtualenv meson # Install bundler if doesn't exist @@ -1059,6 +1048,10 @@ test_linux_wheels() { local python_versions="${TEST_PYTHON_VERSIONS:-3.9 3.10 3.11 3.12 3.13}" local platform_tags="${TEST_WHEEL_PLATFORM_TAGS:-manylinux_2_17_${arch}.manylinux2014_${arch} manylinux_2_28_${arch}}" + if [ "${SOURCE_KIND}" != "local" ]; then + local wheel_content="OFF" + fi + for python in ${python_versions}; do local pyver=${python/m} for platform in ${platform_tags}; do @@ -1068,7 +1061,8 @@ test_linux_wheels() { continue fi pip install pyarrow-${TEST_PYARROW_VERSION:-${VERSION}}-cp${pyver/.}-cp${python/.}-${platform}.whl - INSTALL_PYARROW=OFF ARROW_GCS=${check_gcs} ${ARROW_DIR}/ci/scripts/python_wheel_unix_test.sh ${ARROW_SOURCE_DIR} + CHECK_WHEEL_CONTENT=${wheel_content:-"ON"} INSTALL_PYARROW=OFF ARROW_GCS=${check_gcs} \ + ${ARROW_DIR}/ci/scripts/python_wheel_unix_test.sh ${ARROW_SOURCE_DIR} done done } @@ -1081,11 +1075,15 @@ test_macos_wheels() { # apple silicon processor if [ "$(uname -m)" = "arm64" ]; then local python_versions="3.9 3.10 3.11 3.12 3.13" - local platform_tags="macosx_11_0_arm64" + local platform_tags="macosx_12_0_arm64" local check_flight=OFF else local python_versions="3.9 3.10 3.11 3.12 3.13" - local platform_tags="macosx_10_15_x86_64" + local platform_tags="macosx_12_0_x86_64" + fi + + if [ "${SOURCE_KIND}" != "local" ]; then + local wheel_content="OFF" fi # verify arch-native wheels inside an arch-native conda environment @@ -1104,7 +1102,8 @@ test_macos_wheels() { fi pip install pyarrow-${VERSION}-cp${pyver/.}-cp${python/.}-${platform}.whl - INSTALL_PYARROW=OFF ARROW_FLIGHT=${check_flight} ARROW_GCS=${check_gcs} ARROW_S3=${check_s3} \ + CHECK_WHEEL_CONTENT=${wheel_content:-"ON"} INSTALL_PYARROW=OFF ARROW_FLIGHT=${check_flight} \ + ARROW_GCS=${check_gcs} ARROW_S3=${check_s3} \ ${ARROW_DIR}/ci/scripts/python_wheel_unix_test.sh ${ARROW_SOURCE_DIR} done done diff --git a/dev/tasks/linux-packages/github.linux.yml b/dev/tasks/linux-packages/github.linux.yml index cce976cd60e4e..263394eb2b617 100644 --- a/dev/tasks/linux-packages/github.linux.yml +++ b/dev/tasks/linux-packages/github.linux.yml @@ -83,6 +83,10 @@ jobs: APT_TARGETS: {{ target }} REPO: {{ '${{ secrets.REPO }}' }} YUM_TARGETS: {{ target }} + + {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/*/*/repositories/**/*{}") %} + {{ macros.github_upload_releases(patterns)|indent }} + - name: Set up test run: | sudo apt install -y \ @@ -123,6 +127,3 @@ jobs: APT_TARGETS: {{ target }} ARROW_VERSION: {{ arrow.version }} YUM_TARGETS: {{ target }} - - {% set patterns = upload_extensions | format_all("arrow/dev/tasks/linux-packages/*/*/repositories/**/*{}") %} - {{ macros.github_upload_releases(patterns)|indent }} diff --git a/dev/tasks/verify-rc/github.linux.amd64.docker.yml b/dev/tasks/verify-rc/github.linux.amd64.docker.yml index f2c0673314826..97eecd2d28977 100644 --- 
a/dev/tasks/verify-rc/github.linux.amd64.docker.yml +++ b/dev/tasks/verify-rc/github.linux.amd64.docker.yml @@ -41,6 +41,15 @@ jobs: {% if distro == 'almalinux' and target|upper == 'PYTHON' %} -e ARROW_GANDIVA=OFF \ {% endif %} + {% if distro == "conda" and target == "integration" %} + {# JPype doesn't work with Python 3.13. + # See also: + # * https://discuss.python.org/t/api-for-python-3-13-prevents-use-of-3rd-party-gc-allocators/62709/5 + # * GH-44386 + # * GH-44389 + #} + -e PYTHON_VERSION="3.12" \ + {% endif %} -e VERIFY_RC="{{ rc|default("") }}" \ -e TEST_DEFAULT=0 \ -e TEST_{{ target|upper }}=1 \ diff --git a/dev/tasks/verify-rc/github.macos.yml b/dev/tasks/verify-rc/github.macos.yml index e2bc7895c6d05..e0272e8f4e321 100644 --- a/dev/tasks/verify-rc/github.macos.yml +++ b/dev/tasks/verify-rc/github.macos.yml @@ -65,6 +65,14 @@ jobs: TEST_DEFAULT: 0 TEST_{{ target|upper }}: 1 {% if use_conda %} + {% if target == "integration" %} + # JPype doesn't work with Python 3.13. + # See also: + # * https://discuss.python.org/t/api-for-python-3-13-prevents-use-of-3rd-party-gc-allocators/62709/5 + # * GH-44386 + # * GH-44389 + PYTHON_VERSION: "3.12" + {% endif %} USE_CONDA: 1 {% endif %} run: | diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst index a7e5d0cf07e0a..a87e3810c47a0 100644 --- a/docs/source/cpp/api/array.rst +++ b/docs/source/cpp/api/array.rst @@ -19,6 +19,9 @@ Arrays ====== +Base classes +============ + .. doxygenclass:: arrow::ArrayData :project: arrow_cpp :members: @@ -85,6 +88,16 @@ Chunked Arrays :project: arrow_cpp :members: +.. doxygentypedef:: arrow::ChunkLocation + :project: arrow_cpp + +.. doxygenstruct:: arrow::TypedChunkLocation + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ChunkResolver + :project: arrow_cpp + :members: Utilities ========= diff --git a/docs/source/index.rst b/docs/source/index.rst index 3e678c78b6963..4b16f1f9db142 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,15 +20,14 @@ Apache Arrow ============ -Apache Arrow is a development platform for in-memory analytics. It contains a -set of technologies that enable big data systems to process and move data -fast. It specifies a standardized language-independent columnar memory format -for flat and hierarchical data, organized for efficient analytic operations on -modern hardware. +Apache Arrow is a universal columnar format and multi-language toolbox for fast +data interchange and in-memory analytics. -The project is developing a multi-language collection of libraries for solving -systems problems related to in-memory analytical data processing. This includes -such topics as: +The project specifies a language-independent column-oriented memory format +for flat and hierarchical data, organized for efficient analytic operations on +modern hardware. The project houses an actively developed collection of +libraries in many languages for solving problems related to data transfer and +in-memory analytical processing. This includes such topics as: * Zero-copy shared memory and RPC-based data movement * Reading and writing file formats (like CSV, Apache ORC, and Apache Parquet) diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst index 1556be9dbd011..26c70216a8af2 100644 --- a/docs/source/python/api/substrait.rst +++ b/docs/source/python/api/substrait.rst @@ -43,6 +43,9 @@ compute expressions. 
BoundExpressions deserialize_expressions serialize_expressions + serialize_schema + deserialize_schema + SubstraitSchema Utility ------- diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst index ae9f5de127dfd..48cc67eb66720 100644 --- a/docs/source/python/api/tables.rst +++ b/docs/source/python/api/tables.rst @@ -32,6 +32,7 @@ Factory Functions concat_arrays concat_tables record_batch + concat_batches table Classes diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst index 7acff940ba2ad..337769f246ee7 100644 --- a/docs/source/python/index.rst +++ b/docs/source/python/index.rst @@ -25,8 +25,9 @@ PyArrow - Apache Arrow Python bindings This is the documentation of the Python API of Apache Arrow. -Apache Arrow is a development platform for in-memory analytics. -It contains a set of technologies that enable big data systems to store, process and move data fast. +Apache Arrow is a universal columnar format and multi-language toolbox for fast +data interchange and in-memory analytics. It contains a set of technologies that +enable data systems to efficiently store, process, and move data. See the :doc:`parent documentation <../index>` for additional details on the Arrow Project itself, on the Arrow format and the other language bindings. diff --git a/docs/source/python/integration.rst b/docs/source/python/integration.rst index 1cafc3dbded37..95c912c187d52 100644 --- a/docs/source/python/integration.rst +++ b/docs/source/python/integration.rst @@ -34,6 +34,7 @@ This allows to easily integrate PyArrow with other languages and technologies. .. toctree:: :maxdepth: 2 + integration/substrait integration/python_r integration/python_java integration/extending diff --git a/docs/source/python/integration/substrait.rst b/docs/source/python/integration/substrait.rst new file mode 100644 index 0000000000000..eaa6151e4d32e --- /dev/null +++ b/docs/source/python/integration/substrait.rst @@ -0,0 +1,249 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +========= +Substrait +========= + +The ``arrow-substrait`` module implements support for the Substrait_ format, +enabling conversion to and from Arrow objects. + +The ``arrow-dataset`` module can execute Substrait_ plans via the +:doc:`Acero <../cpp/streaming_execution>` query engine. + +.. contents:: + +Working with Schemas +==================== + +Arrow schemas can be encoded and decoded using the :meth:`pyarrow.substrait.serialize_schema` and +:meth:`pyarrow.substrait.deserialize_schema` functions. + +.. 
code-block:: python
+
+    import pyarrow as pa
+    import pyarrow.substrait as pa_substrait
+
+    arrow_schema = pa.schema([
+        pa.field("x", pa.int32()),
+        pa.field("y", pa.string())
+    ])
+    substrait_schema = pa_substrait.serialize_schema(arrow_schema)
+
+The schema marshalled as a Substrait ``NamedStruct`` is directly
+available as ``substrait_schema.schema``::
+
+    >>> print(substrait_schema.schema)
+    b'\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01'
+
+If custom Arrow types were used, the schema will require extensions for
+those types to be usable. For this reason, the schema is also available
+as an `Extended Expression`_ that includes all the extension types::
+
+    >>> print(substrait_schema.expression)
+    b'"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0'
+
+If ``Substrait Python`` is installed, the schema can also be converted to
+a ``substrait-python`` object::
+
+    >>> print(substrait_schema.to_pysubstrait())
+    version {
+      minor_number: 44
+      producer: "Acero 17.0.0"
+    }
+    base_schema {
+      names: "x"
+      names: "y"
+      struct {
+        types {
+          i32 {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+        types {
+          string {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+      }
+    }
+
+Working with Expressions
+========================
+
+Arrow compute expressions can be encoded and decoded using the
+:meth:`pyarrow.substrait.serialize_expressions` and
+:meth:`pyarrow.substrait.deserialize_expressions` functions.
+
+.. code-block:: python
+
+    import pyarrow as pa
+    import pyarrow.compute as pc
+    import pyarrow.substrait as pa_substrait
+
+    arrow_schema = pa.schema([
+        pa.field("x", pa.int32()),
+        pa.field("y", pa.int32())
+    ])
+
+    substrait_expr = pa_substrait.serialize_expressions(
+        exprs=[pc.field("x") + pc.field("y")],
+        names=["total"],
+        schema=arrow_schema
+    )
+
+The result of encoding an expression to Substrait is the protobuf
+``ExtendedExpression`` message data itself::
+
+    >>> print(bytes(substrait_expr))
+    b'\nZ\x12Xhttps://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml\x12\x07\x1a\x05\x1a\x03add\x1a>\n5\x1a3\x1a\x04*\x02\x10\x01"\n\x1a\x08\x12\x06\n\x02\x12\x00"\x00"\x0c\x1a\n\x12\x08\n\x04\x12\x02\x08\x01"\x00*\x11\n\x08overflow\x12\x05ERROR\x1a\x05total"\x14\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04*\x02\x10\x01:\x19\x10,*\x15Acero 17.0.0'
+
+So if a ``Substrait Python`` object is required, the expression has to be
+decoded using ``substrait-python`` itself::
+
+    >>> import substrait
+    >>> pysubstrait_expr = substrait.proto.ExtendedExpression.FromString(substrait_expr)
+    >>> print(pysubstrait_expr)
+    version {
+      minor_number: 44
+      producer: "Acero 17.0.0"
+    }
+    extension_uris {
+      uri: "https://github.com/substrait-io/substrait/blob/main/extensions/functions_arithmetic.yaml"
+    }
+    extensions {
+      extension_function {
+        name: "add"
+      }
+    }
+    referred_expr {
+      expression {
+        scalar_function {
+          arguments {
+            value {
+              selection {
+                direct_reference {
+                  struct_field {
+                  }
+                }
+                root_reference {
+                }
+              }
+            }
+          }
+          arguments {
+            value {
+              selection {
+                direct_reference {
+                  struct_field {
+                    field: 1
+                  }
+                }
+                root_reference {
+                }
+              }
+            }
+          }
+          options {
+            name: "overflow"
+            preference: "ERROR"
+          }
+          output_type {
+            i32 {
+              nullability: NULLABILITY_NULLABLE
+            }
+          }
+        }
+      }
+      output_names: "total"
+    }
+    base_schema {
+      names: "x"
+      names: "y"
+      struct {
+        types {
+          i32 {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+        types {
+          i32 {
+            nullability: NULLABILITY_NULLABLE
+          }
+        }
+      }
+    }
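+
+Both serialized forms can be decoded back into Arrow objects with the
+``deserialize_*`` counterparts. Below is a minimal round-trip sketch; it
+assumes the ``arrow_schema``, ``substrait_schema`` and ``substrait_expr``
+objects from the examples above, and that
+:meth:`pyarrow.substrait.deserialize_schema` accepts the serialized
+``NamedStruct`` shown earlier::
+
+    >>> # Decode the Substrait NamedStruct back into an Arrow schema.
+    >>> pa_substrait.deserialize_schema(substrait_schema.schema) == arrow_schema
+    True
+    >>> # Decode the ExtendedExpression back into named compute expressions.
+    >>> bound = pa_substrait.deserialize_expressions(substrait_expr)
+    >>> list(bound.expressions)
+    ['total']
+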
+Executing Queries Using Substrait Extended Expressions
+======================================================
+
+Datasets support executing queries using Substrait's `Extended Expression`_.
+The expressions can be passed to the dataset scanner in the form of
+:class:`pyarrow.substrait.BoundExpressions`.
+
+.. code-block:: python
+
+    import pyarrow.dataset as ds
+    import pyarrow.substrait as pa_substrait
+
+    # Use substrait-python to create the queries
+    from substrait import proto
+
+    dataset = ds.dataset("./data/index-0.parquet")
+    substrait_schema = pa_substrait.serialize_schema(dataset.schema).to_pysubstrait()
+
+    # SELECT project_name FROM dataset WHERE project_name = 'pyarrow'
+
+    projection = proto.ExtendedExpression(referred_expr=[
+        {"expression": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}},
+         "output_names": ["project_name"]}
+    ])
+    projection.MergeFrom(substrait_schema)
+
+    filtering = proto.ExtendedExpression(
+        extension_uris=[{"extension_uri_anchor": 99, "uri": "/functions_comparison.yaml"}],
+        extensions=[{"extension_function": {"extension_uri_reference": 99, "function_anchor": 199, "name": "equal:any1_any1"}}],
+        referred_expr=[
+            {"expression": {"scalar_function": {"function_reference": 199, "arguments": [
+                {"value": {"selection": {"direct_reference": {"struct_field": {"field": 0}}}}},
+                {"value": {"literal": {"string": "pyarrow"}}}
+            ], "output_type": {"bool": {"nullability": False}}}}}
+        ]
+    )
+    filtering.MergeFrom(substrait_schema)
+
+    results = dataset.scanner(
+        columns=pa_substrait.BoundExpressions.from_substrait(projection),
+        filter=pa_substrait.BoundExpressions.from_substrait(filtering)
+    ).head(5)
+
+
+.. code-block:: text
+
+      project_name
+    0      pyarrow
+    1      pyarrow
+    2      pyarrow
+    3      pyarrow
+    4      pyarrow
+
+
+.. _`Substrait`: https://substrait.io/
+.. _`Substrait Python`: https://github.com/substrait-io/substrait-python
+.. _`Acero`: https://arrow.apache.org/docs/cpp/streaming_execution.html
+.. 
_`Extended Expression`: https://github.com/substrait-io/substrait/blob/main/site/docs/expressions/extended_expression.md diff --git a/docs/source/status.rst b/docs/source/status.rst index c838604fcaef6..5ab35f7639f56 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -208,15 +208,15 @@ Supported features in the gRPC transport: +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Flight RPC Feature | C++ | Java | Go | JS | C# | Rust | Julia | Swift | +============================================+=======+=======+=======+====+=======+=======+=======+=======+ -| All RPC methods | ✓ | ✓ | ✓ | | ✓ (1) | ✓ | | | +| All RPC methods | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (2) | ✓ | | | +| Authentication handlers | ✓ | ✓ | ✓ | | ✓ (1) | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call timeouts | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call cancellation | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Concurrent client calls (3) | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +| Concurrent client calls (2) | ✓ | ✓ | ✓ | | ✓ | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Custom middleware | ✓ | ✓ | ✓ | | | ✓ | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -228,7 +228,7 @@ Supported features in the UCX transport: +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Flight RPC Feature | C++ | Java | Go | JS | C# | Rust | Julia | Swift | +============================================+=======+=======+=======+====+=======+=======+=======+=======+ -| All RPC methods | ✓ (4) | | | | | | | | +| All RPC methods | ✓ (3) | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Authentication handlers | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -236,7 +236,7 @@ Supported features in the UCX transport: +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Call cancellation | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ -| Concurrent client calls | ✓ (5) | | | | | | | | +| Concurrent client calls | ✓ (4) | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ | Custom middleware | | | | | | | | | +--------------------------------------------+-------+-------+-------+----+-------+-------+-------+-------+ @@ -245,11 +245,10 @@ Supported features in the UCX transport: Notes: -* \(1) No support for Handshake or DoExchange. -* \(2) Support using AspNetCore authentication handlers. -* \(3) Whether a single client can support multiple concurrent calls. -* \(4) Only support for DoExchange, DoGet, DoPut, and GetFlightInfo. 
-* \(5) Each concurrent call is a separate connection to the server +* \(1) Support using AspNetCore authentication handlers. +* \(2) Whether a single client can support multiple concurrent calls. +* \(3) Only support for DoExchange, DoGet, DoPut, and GetFlightInfo. +* \(4) Each concurrent call is a separate connection to the server (unlike gRPC where concurrent calls are multiplexed over a single connection). This will generally provide better throughput but consumes more resources both on the server and the client. diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index 150c11e41edff..2661c12cda3af 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -187,7 +187,7 @@ public List visit(ArrowType.Union type) { @Override public List visit(ArrowType.RunEndEncoded type) { - throw new UnsupportedOperationException("Importing buffers for type: " + type); + return List.of(); } @Override diff --git a/java/c/src/main/java/org/apache/arrow/c/Format.java b/java/c/src/main/java/org/apache/arrow/c/Format.java index f77a555d18481..7ce99614d2a7a 100644 --- a/java/c/src/main/java/org/apache/arrow/c/Format.java +++ b/java/c/src/main/java/org/apache/arrow/c/Format.java @@ -233,6 +233,8 @@ static String asString(ArrowType arrowType) { return "+vl"; case LargeListView: return "+vL"; + case RunEndEncoded: + return "+r"; case NONE: throw new IllegalArgumentException("Arrow type ID is NONE"); default: @@ -321,6 +323,8 @@ static ArrowType asType(String format, long flags) return new ArrowType.ListView(); case "+vL": return new ArrowType.LargeListView(); + case "+r": + return new ArrowType.RunEndEncoded(); default: String[] parts = format.split(":", 2); if (parts.length == 2) { diff --git a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java index d8286465e475f..67ab282de5a32 100644 --- a/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java +++ b/java/c/src/test/java/org/apache/arrow/c/RoundtripTest.java @@ -88,6 +88,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.RunEndEncodedVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.UnionMapWriter; @@ -770,6 +771,22 @@ public void testStructVector() { } } + @Test + public void testRunEndEncodedVector() { + try (final RunEndEncodedVector vector = RunEndEncodedVector.empty("v", allocator)) { + setVector(vector, List.of(1, 3), List.of(1, 2)); + assertTrue(roundtrip(vector, RunEndEncodedVector.class)); + } + } + + @Test + public void testEmptyRunEndEncodedVector() { + try (final RunEndEncodedVector vector = RunEndEncodedVector.empty("v", allocator)) { + setVector(vector, List.of(), List.of()); + assertTrue(roundtrip(vector, RunEndEncodedVector.class)); + } + } + @Test public void testExtensionTypeVector() { ExtensionTypeRegistry.register(new UuidType()); diff --git a/java/c/src/test/python/integration_tests.py b/java/c/src/test/python/integration_tests.py index b0a86e9c66e59..3e14be11c4644 100644 --- a/java/c/src/test/python/integration_tests.py +++ b/java/c/src/test/python/integration_tests.py @@ -399,6 +399,20 @@ def recreate_batch(): 
return reader.read_next_batch() self.round_trip_record_batch(recreate_batch) + + def test_runendencoded_array(self): + # empty vector + self.round_trip_array(lambda: pa.RunEndEncodedArray.from_arrays([], [], pa.run_end_encoded(pa.int64(), pa.int64()))) + + # constant null vector + self.round_trip_array(lambda: pa.RunEndEncodedArray.from_arrays([10], [None])) + # constant int vector + self.round_trip_array(lambda: pa.RunEndEncodedArray.from_arrays([10], [10])) + + # run end int vector + self.round_trip_array(lambda: pa.RunEndEncodedArray.from_arrays([3, 5, 10, 12, 19], [1, 2, 1, None, 3])) + # run end string vector + self.round_trip_array(lambda: pa.RunEndEncodedArray.from_arrays([3, 5, 10, 12, 19], ["1", "2", "1", None, "3"])) if __name__ == '__main__': unittest.main(verbosity=2) diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeEchoScenario.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeEchoScenario.java new file mode 100644 index 0000000000000..3e7fa19a81927 --- /dev/null +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeEchoScenario.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.flight.integration.tests; + +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import org.apache.arrow.flight.FlightClient; +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightProducer; +import org.apache.arrow.flight.FlightServer; +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.Location; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.Validator; + +/** Test DoExchange by echoing data back to the client. 
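+ *
+ * <p>The client writes several record batches, attaching application metadata on
+ * every other batch, and verifies that the server echoes both the batches and
+ * the metadata back unchanged.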
*/ +final class DoExchangeEchoScenario implements Scenario { + public static final byte[] COMMAND = "echo".getBytes(StandardCharsets.UTF_8); + + @Override + public FlightProducer producer(BufferAllocator allocator, Location location) throws Exception { + return new DoExchangeProducer(allocator); + } + + @Override + public void buildServer(FlightServer.Builder builder) {} + + @Override + public void client(BufferAllocator allocator, Location location, FlightClient client) + throws Exception { + final Schema schema = + new Schema(Collections.singletonList(Field.notNullable("x", new ArrowType.Int(32, true)))); + try (final FlightClient.ExchangeReaderWriter stream = + client.doExchange(FlightDescriptor.command(COMMAND)); + final VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + final FlightStream reader = stream.getReader(); + + // Write data and check that it gets echoed back. + IntVector iv = (IntVector) root.getVector("x"); + iv.allocateNew(); + stream.getWriter().start(root); + int rowCount = 10; + for (int batchIdx = 0; batchIdx < 4; batchIdx++) { + for (int rowIdx = 0; rowIdx < rowCount; rowIdx++) { + iv.setSafe(rowIdx, batchIdx + rowIdx); + } + root.setRowCount(rowCount); + boolean writeMetadata = batchIdx % 2 == 0; + final byte[] rawMetadata = Integer.toString(batchIdx).getBytes(StandardCharsets.UTF_8); + if (writeMetadata) { + final ArrowBuf metadata = allocator.buffer(rawMetadata.length); + metadata.writeBytes(rawMetadata); + stream.getWriter().putNext(metadata); + } else { + stream.getWriter().putNext(); + } + + IntegrationAssertions.assertTrue("Unexpected end of reader", reader.next()); + if (writeMetadata) { + IntegrationAssertions.assertNotNull(reader.getLatestMetadata()); + final byte[] readMetadata = new byte[rawMetadata.length]; + reader.getLatestMetadata().readBytes(readMetadata); + IntegrationAssertions.assertEquals(rawMetadata, readMetadata); + } else { + IntegrationAssertions.assertNull(reader.getLatestMetadata()); + } + IntegrationAssertions.assertEquals(root.getSchema(), reader.getSchema()); + Validator.compareVectorSchemaRoot(reader.getRoot(), root); + } + + stream.getWriter().completed(); + IntegrationAssertions.assertFalse("Expected to reach end of reader", reader.next()); + } + } +} diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeProducer.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeProducer.java new file mode 100644 index 0000000000000..2e28ab1233e7c --- /dev/null +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/DoExchangeProducer.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.arrow.flight.integration.tests;
+
+import java.util.Arrays;
+import org.apache.arrow.flight.CallStatus;
+import org.apache.arrow.flight.FlightDescriptor;
+import org.apache.arrow.flight.FlightStream;
+import org.apache.arrow.flight.NoOpFlightProducer;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.VectorLoader;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.VectorUnloader;
+import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
+
+/** The server used for testing the Flight do_exchange method. */
+final class DoExchangeProducer extends NoOpFlightProducer {
+  private final BufferAllocator allocator;
+
+  DoExchangeProducer(BufferAllocator allocator) {
+    this.allocator = allocator;
+  }
+
+  @Override
+  public void doExchange(CallContext context, FlightStream reader, ServerStreamListener writer) {
+    FlightDescriptor descriptor = reader.getDescriptor();
+    if (descriptor.isCommand()) {
+      if (Arrays.equals(DoExchangeEchoScenario.COMMAND, descriptor.getCommand())) {
+        doEcho(reader, writer);
+        return;
+      }
+    }
+    throw CallStatus.UNIMPLEMENTED
+        .withDescription("Unsupported descriptor: " + descriptor.toString())
+        .toRuntimeException();
+  }
+
+  private void doEcho(FlightStream reader, ServerStreamListener writer) {
+    VectorSchemaRoot root = null;
+    VectorLoader loader = null;
+    while (reader.next()) {
+      if (reader.hasRoot()) {
+        if (root == null) {
+          root = VectorSchemaRoot.create(reader.getSchema(), allocator);
+          loader = new VectorLoader(root);
+          writer.start(root);
+        }
+        VectorUnloader unloader = new VectorUnloader(reader.getRoot());
+        try (final ArrowRecordBatch arb = unloader.getRecordBatch()) {
+          loader.load(arb);
+        }
+        if (reader.getLatestMetadata() != null) {
+          reader.getLatestMetadata().getReferenceManager().retain();
+          writer.putNext(reader.getLatestMetadata());
+        } else {
+          writer.putNext();
+        }
+      } else {
+        // Pure metadata
+        reader.getLatestMetadata().getReferenceManager().retain();
+        writer.putMetadata(reader.getLatestMetadata());
+      }
+    }
+    if (root != null) {
+      root.close();
+    }
+    writer.completed();
+  }
+}
diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/IntegrationAssertions.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/IntegrationAssertions.java
index 92d4c73f2be87..ada565c635428 100644
--- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/IntegrationAssertions.java
+++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/IntegrationAssertions.java
@@ -78,6 +78,12 @@ static void assertTrue(String message, boolean value) {
     }
   }

+  static void assertNull(Object actual) {
+    if (actual != null) {
+      throw new AssertionError("Expected: null\n\nbut got: " + actual);
+    }
+  }
+
   static void assertNotNull(Object actual) {
     if (actual == null) {
       throw new AssertionError("Expected: (not null)\n\nbut got: null\n");
diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java
index 451edb6bd5a34..7903ae994c7d1 100644
--- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java
+++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/Scenarios.java
@@ -51,6
+51,7 @@ private Scenarios() { scenarios.put("flight_sql:ingestion", FlightSqlIngestionScenario::new); scenarios.put("app_metadata_flight_info_endpoint", AppMetadataFlightInfoEndpointScenario::new); scenarios.put("session_options", SessionOptionsScenario::new); + scenarios.put("do_exchange:echo", DoExchangeEchoScenario::new); } private static Scenarios getInstance() { diff --git a/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java b/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java index 8419432c66227..16265b8b37014 100644 --- a/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java +++ b/java/flight/flight-integration-tests/src/test/java/org/apache/arrow/flight/integration/tests/IntegrationTest.java @@ -99,6 +99,11 @@ void sessionOptions() throws Exception { testScenario("session_options"); } + @Test + void doExchangeEcho() throws Exception { + testScenario("do_exchange:echo"); + } + void testScenario(String scenarioName) throws Exception { TestBufferAllocationListener listener = new TestBufferAllocationListener(); try (final BufferAllocator allocator = new RootAllocator(listener, Long.MAX_VALUE)) { diff --git a/java/pom.xml b/java/pom.xml index c9560879767e7..876ce703f0c16 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -93,14 +93,14 @@ under the License. ${project.build.directory}/generated-sources 1.9.0 - 5.11.2 + 5.11.3 2.0.16 33.3.1-jre 4.1.114.Final 1.65.0 3.25.4 2.18.0 - 3.4.0 + 3.4.1 24.3.25 1.12.0 @@ -110,8 +110,8 @@ under the License. 2.31.0 5.11.0 5.2.0 - 3.48.0 - 1.5.8 + 3.48.1 + 1.5.11 none -Xdoclint:none @@ -279,7 +279,7 @@ under the License. org.mockito mockito-junit-jupiter - 5.14.1 + 5.14.2 test @@ -530,7 +530,7 @@ under the License. org.cyclonedx cyclonedx-maven-plugin - 2.8.2 + 2.9.0 org.apache.drill.tools diff --git a/java/vector/src/main/codegen/templates/UnionMapWriter.java b/java/vector/src/main/codegen/templates/UnionMapWriter.java index 606f880377be7..90b55cb65e6ef 100644 --- a/java/vector/src/main/codegen/templates/UnionMapWriter.java +++ b/java/vector/src/main/codegen/templates/UnionMapWriter.java @@ -219,4 +219,16 @@ public MapWriter map(boolean keysSorted) { return super.map(); } } + + @Override + public MapWriter map() { + switch (mode) { + case KEY: + return entryWriter.map(MapVector.KEY_NAME); + case VALUE: + return entryWriter.map(MapVector.VALUE_NAME); + default: + return super.map(); + } + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java index e8de86f6e9549..1bb9a3d6c05f3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RunEndEncodedVector.java @@ -30,8 +30,11 @@ import org.apache.arrow.memory.util.hash.ArrowBufHasher; import org.apache.arrow.vector.BaseIntVector; import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.compare.VectorVisitor; @@ -50,6 +53,7 @@ * values vector of any type. 
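+ * Each run is stored once in the values vector, and the corresponding entry in
+ * the run-ends vector records the logical index (exclusive) at which that run
+ * ends.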
There are no buffers associated with the parent vector. */ public class RunEndEncodedVector extends BaseValueVector implements FieldVector { + public static final FieldVector DEFAULT_VALUE_VECTOR = ZeroVector.INSTANCE; public static final FieldVector DEFAULT_RUN_END_VECTOR = ZeroVector.INSTANCE; @@ -203,6 +207,7 @@ public void clear() { for (FieldVector v : getChildrenFromFields()) { v.clear(); } + this.valueCount = 0; } /** @@ -234,19 +239,6 @@ public MinorType getMinorType() { return MinorType.RUNENDENCODED; } - /** - * To transfer quota responsibility. - * - * @param allocator the target allocator - * @return a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new - * target vector of the same type. - */ - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - throw new UnsupportedOperationException( - "RunEndEncodedVector does not support getTransferPair(BufferAllocator)"); - } - /** * To transfer quota responsibility. * @@ -284,8 +276,7 @@ public TransferPair getTransferPair(Field field, BufferAllocator allocator) { */ @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { - throw new UnsupportedOperationException( - "RunEndEncodedVector does not support getTransferPair(String, BufferAllocator, CallBack)"); + return new TransferImpl(ref, allocator, callBack); } /** @@ -299,8 +290,7 @@ public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallB */ @Override public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { - throw new UnsupportedOperationException( - "RunEndEncodedVector does not support getTransferPair(Field, BufferAllocator, CallBack)"); + return new TransferImpl(field, allocator, callBack); } /** @@ -312,8 +302,156 @@ public TransferPair getTransferPair(Field field, BufferAllocator allocator, Call */ @Override public TransferPair makeTransferPair(ValueVector target) { - throw new UnsupportedOperationException( - "RunEndEncodedVector does not support makeTransferPair(ValueVector)"); + return new TransferImpl((RunEndEncodedVector) target); + } + + private class TransferImpl implements TransferPair { + + RunEndEncodedVector to; + TransferPair dataTransferPair; + TransferPair reeTransferPair; + + public TransferImpl(String name, BufferAllocator allocator, CallBack callBack) { + this(new RunEndEncodedVector(name, allocator, field.getFieldType(), callBack)); + } + + public TransferImpl(Field field, BufferAllocator allocator, CallBack callBack) { + this(new RunEndEncodedVector(field, allocator, callBack)); + } + + public TransferImpl(RunEndEncodedVector to) { + this.to = to; + if (to.getRunEndsVector() instanceof ZeroVector) { + to.initializeChildrenFromFields(field.getChildren()); + } + reeTransferPair = getRunEndsVector().makeTransferPair(to.getRunEndsVector()); + dataTransferPair = getValuesVector().makeTransferPair(to.getValuesVector()); + } + + /** + * Transfer the vector data to another vector. The memory associated with this vector is + * transferred to the allocator of target vector for accounting and management purposes. + */ + @Override + public void transfer() { + to.clear(); + dataTransferPair.transfer(); + reeTransferPair.transfer(); + if (valueCount > 0) { + to.setValueCount(valueCount); + } + clear(); + } + + /** + * Slice this vector at the desired index and length, then transfer the corresponding data to + * the target vector. + * + * @param startIndex start position of the split in source vector. 
+     * @param length length of the split.
+     */
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      to.clear();
+      if (length <= 0) {
+        return;
+      }
+
+      int physicalStartIndex = getPhysicalIndex(startIndex);
+      int physicalEndIndex = getPhysicalIndex(startIndex + length - 1);
+      int physicalLength = physicalEndIndex - physicalStartIndex + 1;
+      dataTransferPair.splitAndTransfer(physicalStartIndex, physicalLength);
+      FieldVector toRunEndsVector = to.runEndsVector;
+      if (startIndex == 0) {
+        if (((BaseIntVector) runEndsVector).getValueAsLong(physicalEndIndex) == length) {
+          reeTransferPair.splitAndTransfer(physicalStartIndex, physicalLength);
+        } else {
+          reeTransferPair.splitAndTransfer(physicalStartIndex, physicalLength - 1);
+          toRunEndsVector.setValueCount(physicalLength);
+          if (toRunEndsVector instanceof SmallIntVector) {
+            ((SmallIntVector) toRunEndsVector).set(physicalEndIndex, length);
+          } else if (toRunEndsVector instanceof IntVector) {
+            ((IntVector) toRunEndsVector).set(physicalEndIndex, length);
+          } else if (toRunEndsVector instanceof BigIntVector) {
+            ((BigIntVector) toRunEndsVector).set(physicalEndIndex, length);
+          } else {
+            throw new IllegalArgumentException(
+                "Run-end vector must be of type int with size 16, 32, or 64 bits.");
+          }
+        }
+      } else {
+        shiftRunEndsVector(
+            toRunEndsVector,
+            startIndex,
+            length,
+            physicalStartIndex,
+            physicalEndIndex,
+            physicalLength);
+      }
+      getTo().setValueCount(length);
+    }
+
+    private void shiftRunEndsVector(
+        ValueVector toRunEndVector,
+        int startIndex,
+        int length,
+        int physicalStartIndex,
+        int physicalEndIndex,
+        int physicalLength) {
+      toRunEndVector.setValueCount(physicalLength);
+      toRunEndVector.getValidityBuffer().setOne(0, toRunEndVector.getValidityBuffer().capacity());
+      ArrowBuf fromRunEndBuffer = runEndsVector.getDataBuffer();
+      ArrowBuf toRunEndBuffer = toRunEndVector.getDataBuffer();
+      int physicalLastIndex = physicalLength - 1;
+      if (toRunEndVector instanceof SmallIntVector) {
+        byte typeWidth = SmallIntVector.TYPE_WIDTH;
+        for (int i = 0; i < physicalLastIndex; i++) {
+          toRunEndBuffer.setShort(
+              (long) i * typeWidth,
+              fromRunEndBuffer.getShort((long) (i + physicalStartIndex) * typeWidth) - startIndex);
+        }
+        int lastEnd =
+            Math.min(
+                fromRunEndBuffer.getShort((long) physicalEndIndex * typeWidth) - startIndex,
+                length);
+        toRunEndBuffer.setShort((long) physicalLastIndex * typeWidth, lastEnd);
+      } else if (toRunEndVector instanceof IntVector) {
+        byte typeWidth = IntVector.TYPE_WIDTH;
+        for (int i = 0; i < physicalLastIndex; i++) {
+          toRunEndBuffer.setInt(
+              (long) i * typeWidth,
+              fromRunEndBuffer.getInt((long) (i + physicalStartIndex) * typeWidth) - startIndex);
+        }
+        int lastEnd =
+            Math.min(
+                fromRunEndBuffer.getInt((long) physicalEndIndex * typeWidth) - startIndex, length);
+        toRunEndBuffer.setInt((long) physicalLastIndex * typeWidth, lastEnd);
+      } else if (toRunEndVector instanceof BigIntVector) {
+        byte typeWidth = BigIntVector.TYPE_WIDTH;
+        for (int i = 0; i < physicalLastIndex; i++) {
+          toRunEndBuffer.setLong(
+              (long) i * typeWidth,
+              fromRunEndBuffer.getLong((long) (i + physicalStartIndex) * typeWidth) - startIndex);
+        }
+        long lastEnd =
+            Math.min(
+                fromRunEndBuffer.getLong((long) physicalEndIndex * typeWidth) - startIndex, length);
+        toRunEndBuffer.setLong((long) physicalLastIndex * typeWidth, lastEnd);
+      } else {
+        throw new IllegalArgumentException(
+            "Run-end vector must be of type int with size 16, 32, or 64 bits.");
+      }
+    }
+
+    @Override
+    public ValueVector getTo() {
return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, RunEndEncodedVector.this); + } } /** @@ -568,6 +706,7 @@ public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers throw new UnsupportedOperationException( "Run-end encoded vectors do not have any associated buffers."); } + this.valueCount = fieldNode.getLength(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java index 5668325a87eeb..fe0803d2984cb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java @@ -909,10 +909,12 @@ private void readFromJsonIntoVector(Field field, FieldVector vector) throws IOEx variadicBufferIndices)); } - int nullCount = 0; - if (type instanceof ArrowType.Null) { + int nullCount; + if (type instanceof ArrowType.RunEndEncoded || type instanceof Union) { + nullCount = 0; + } else if (type instanceof ArrowType.Null) { nullCount = valueCount; - } else if (!(type instanceof Union)) { + } else { nullCount = BitVectorHelper.getNullCount(vectorBuffers.get(0), valueCount); } final ArrowFieldNode fieldNode = new ArrowFieldNode(valueCount, nullCount); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index ef31b4f837344..5c7215437f8ec 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -305,7 +305,7 @@ public Void visit(RunEndEncodedVector vector, Void value) { if (runCount == 0) { validateOrThrow(valueCount == 0, "Run end vector does not contain enough elements"); } else if (runCount > 0) { - double lastEnd = ((BaseIntVector) runEndsVector).getValueAsLong(runCount - 1); + long lastEnd = ((BaseIntVector) runEndsVector).getValueAsLong(runCount - 1); validateOrThrow( valueCount == lastEnd, "Vector logic length not equal to the last end in run ends vector. 
Logical length %s, last end %s", diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java index 213ffced273a0..a4197c50b5bff 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestMapVector.java @@ -640,11 +640,12 @@ public void testMapWithMapValue() throws Exception { MapWriter valueWriter; // we are essentially writing Map> - // populate map vector with the following three records + // populate map vector with the following four records // [ // null, // [1:[50: 100, 200:400], 2:[75: 175, 150: 250]], - // [3:[10: 20], 4:[15: 20], 5:[25: 30, 35: null]] + // [3:[10: 20], 4:[15: 20], 5:[25: 30, 35: null]], + // [8:[15: 30, 10: 20]] // ] /* write null at index 0 */ @@ -706,11 +707,26 @@ public void testMapWithMapValue() throws Exception { mapWriter.endMap(); - assertEquals(2, mapVector.getLastSet()); + /* write one or more maps at index 3 */ + mapWriter.setPosition(3); + mapWriter.startMap(); + + mapWriter.startEntry(); + mapWriter.key().bigInt().writeBigInt(8); + valueWriter = mapWriter.value().map(); + valueWriter.startMap(); + writeEntry(valueWriter, 15, 30L); + writeEntry(valueWriter, 10, 20L); + valueWriter.endMap(); + mapWriter.endEntry(); + + mapWriter.endMap(); + + assertEquals(3, mapVector.getLastSet()); - mapWriter.setValueCount(3); + mapWriter.setValueCount(4); - assertEquals(3, mapVector.getValueCount()); + assertEquals(4, mapVector.getValueCount()); // Get mapVector element at index 0 Object result = mapVector.getObject(0); @@ -784,19 +800,40 @@ public void testMapWithMapValue() throws Exception { assertEquals(35L, getResultKey(innerMap)); assertNull(innerMap.get(MapVector.VALUE_NAME)); + // Get mapVector element at index 3 + result = mapVector.getObject(3); + resultSet = (ArrayList) result; + + // only 1 map entry at index 3 + assertEquals(1, resultSet.size()); + + resultStruct = (Map) resultSet.get(0); + assertEquals(8L, getResultKey(resultStruct)); + list = (ArrayList>) getResultValue(resultStruct); + assertEquals(2, list.size()); // value is a list of 2 maps + innerMap = list.get(0); + assertEquals(15L, getResultKey(innerMap)); + assertEquals(30L, getResultValue(innerMap)); + innerMap = list.get(1); + assertEquals(10L, getResultKey(innerMap)); + assertEquals(20L, getResultValue(innerMap)); + /* check underlying bitVector */ assertTrue(mapVector.isNull(0)); assertFalse(mapVector.isNull(1)); assertFalse(mapVector.isNull(2)); + assertFalse(mapVector.isNull(3)); /* check underlying offsets */ final ArrowBuf offsetBuffer = mapVector.getOffsetBuffer(); - /* mapVector has 0 entries at index 0, 2 entries at index 1, and 3 entries at index 2 */ + // mapVector has 0 entries at index 0, 2 entries at index 1, 3 entries at index 2, + // and 1 entry at index 3 assertEquals(0, offsetBuffer.getInt(0 * MapVector.OFFSET_WIDTH)); assertEquals(0, offsetBuffer.getInt(1 * MapVector.OFFSET_WIDTH)); assertEquals(2, offsetBuffer.getInt(2 * MapVector.OFFSET_WIDTH)); assertEquals(5, offsetBuffer.getInt(3 * MapVector.OFFSET_WIDTH)); + assertEquals(6, offsetBuffer.getInt(4 * MapVector.OFFSET_WIDTH)); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java index 3f4be2e52ce56..adf51c07301f3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java +++ 
b/java/vector/src/test/java/org/apache/arrow/vector/TestRunEndEncodedVector.java @@ -32,6 +32,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.TransferPair; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -106,25 +107,28 @@ public void testBasicRunEndEncodedVector() { setBasicVector(reeVector, runCount, i -> i % 2 == 0 ? null : i + 1, i -> i + 1); assertEquals(15, reeVector.getValueCount()); - int index = 0; - for (int run = 0; run < runCount; run++) { - long expectedRunValue = (long) run + 1; - for (int j = 0; j <= run; j++) { - if (run % 2 == 0) { - assertNull(reeVector.getObject(index)); - } else { - assertEquals(expectedRunValue, reeVector.getObject(index)); - } - index++; - } - } - + checkBasic(runCount, reeVector); // test index out of bound assertThrows(IndexOutOfBoundsException.class, () -> reeVector.getObject(-1)); assertThrows(IndexOutOfBoundsException.class, () -> reeVector.getObject(logicalValueCount)); } } + private static void checkBasic(int runCount, RunEndEncodedVector reeVector) { + int index = 0; + for (int run = 0; run < runCount; run++) { + long expectedRunValue = (long) run + 1; + for (int j = 0; j <= run; j++) { + if (run % 2 == 0) { + assertNull(reeVector.getObject(index)); + } else { + assertEquals(expectedRunValue, reeVector.getObject(index)); + } + index++; + } + } + } + @Test public void testRangeCompare() { // test compare same constant vector @@ -228,4 +232,102 @@ private static int setBasicVector( reeVector.setValueCount(logicalValueCount); return logicalValueCount; } + + @Test + public void testTransfer() { + // constant vector + try (RunEndEncodedVector reeVector = + new RunEndEncodedVector(createBigIntRunEndEncodedField("constant"), allocator, null)) { + Long value = 65536L; + int logicalValueCount = 100; + setConstantVector(reeVector, value, logicalValueCount); + assertEquals(logicalValueCount, reeVector.getValueCount()); + for (int i = 0; i < logicalValueCount; i++) { + assertEquals(value, reeVector.getObject(i)); + } + + TransferPair transferPair = reeVector.getTransferPair(allocator); + transferPair.transfer(); + assertEquals(0, reeVector.getValueCount()); + assertEquals(0, reeVector.getValuesVector().getValueCount()); + assertEquals(0, reeVector.getRunEndsVector().getValueCount()); + try (RunEndEncodedVector toVector = (RunEndEncodedVector) transferPair.getTo()) { + assertEquals(logicalValueCount, toVector.getValueCount()); + for (int i = 0; i < logicalValueCount; i++) { + assertEquals(value, toVector.getObject(i)); + } + } + } + + // basic run end encoded vector + try (RunEndEncodedVector reeVector = + new RunEndEncodedVector(createBigIntRunEndEncodedField("basic"), allocator, null)) { + // Create REE vector representing: + // [null, 2, 2, null, null, null, 4, 4, 4, 4, null, null, null, null, null]. + int runCount = 5; + final int logicalValueCount = + setBasicVector(reeVector, runCount, i -> i % 2 == 0 ? 
null : i + 1, i -> i + 1);
+
+      assertEquals(15, reeVector.getValueCount());
+      checkBasic(runCount, reeVector);
+
+      TransferPair transferPair = reeVector.getTransferPair(allocator);
+      transferPair.transfer();
+      assertEquals(0, reeVector.getValueCount());
+      assertEquals(0, reeVector.getValuesVector().getValueCount());
+      assertEquals(0, reeVector.getRunEndsVector().getValueCount());
+      try (RunEndEncodedVector toVector = (RunEndEncodedVector) transferPair.getTo()) {
+        assertEquals(logicalValueCount, toVector.getValueCount());
+        checkBasic(runCount, toVector);
+      }
+    }
+  }
+
+  @Test
+  public void testSplitAndTransfer() {
+    // test splitAndTransfer on a constant vector
+    try (RunEndEncodedVector constantVector =
+        new RunEndEncodedVector(createBigIntRunEndEncodedField("constant"), allocator, null)) {
+      int logicalValueCount = 15;
+
+      setConstantVector(constantVector, 1L, logicalValueCount);
+
+      try (RunEndEncodedVector toVector = RunEndEncodedVector.empty("constant", allocator)) {
+        TransferPair transferPair = constantVector.makeTransferPair(toVector);
+        int startIndex = 1;
+        int transferLength = 10;
+        transferPair.splitAndTransfer(startIndex, transferLength);
+
+        toVector.validate();
+        assertEquals(transferLength, toVector.getValueCount());
+        assertTrue(
+            constantVector.accept(
+                new RangeEqualsVisitor(constantVector, toVector), new Range(1, 0, transferLength)));
+      }
+    }
+
+    try (RunEndEncodedVector reeVector =
+        new RunEndEncodedVector(createBigIntRunEndEncodedField("ree"), allocator, null)) {
+
+      setBasicVector(reeVector, 5, i -> i + 1, i -> i + 1);
+
+      int[][] transferConfigs = {{0, 0}, {0, 1}, {0, 9}, {1, 0}, {1, 10}, {1, 14}};
+
+      try (RunEndEncodedVector toVector = RunEndEncodedVector.empty("ree", allocator)) {
+        TransferPair transferPair = reeVector.makeTransferPair(toVector);
+        for (final int[] transferConfig : transferConfigs) {
+          int startIndex = transferConfig[0];
+          int transferLength = transferConfig[1];
+          transferPair.splitAndTransfer(startIndex, transferLength);
+
+          toVector.validate();
+          assertEquals(transferLength, toVector.getValueCount());
+          assertTrue(
+              reeVector.accept(
+                  new RangeEqualsVisitor(reeVector, toVector),
+                  new Range(startIndex, 0, transferLength)));
+        }
+      }
+    }
+  }
 }
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
index afbc30f019ef6..f599dfa539421 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java
@@ -68,10 +68,12 @@ import org.apache.arrow.vector.complex.LargeListViewVector;
 import org.apache.arrow.vector.complex.ListVector;
 import org.apache.arrow.vector.complex.ListViewVector;
+import org.apache.arrow.vector.complex.RunEndEncodedVector;
 import org.apache.arrow.vector.complex.StructVector;
 import org.apache.arrow.vector.holders.IntervalDayHolder;
 import org.apache.arrow.vector.types.Types;
 import org.apache.arrow.vector.types.Types.MinorType;
+import org.apache.arrow.vector.types.pojo.Field;
 import org.apache.arrow.vector.types.pojo.FieldType;

 /** Utility for populating {@link org.apache.arrow.vector.ValueVector}. */
@@ -794,4 +796,41 @@ public static void setVector(LargeListViewVector vector, List... 
values dataVector.setValueCount(curPos); vector.setValueCount(values.length); } + + public static void setVector( + RunEndEncodedVector vector, List runEnds, List values) { + int runCount = runEnds.size(); + assert runCount == values.size(); + final FieldType valueType = FieldType.notNullable(MinorType.INT.getType()); + final FieldType runEndType = FieldType.notNullable(Types.MinorType.INT.getType()); + final Field valueField = new Field("value", valueType, null); + final Field runEndField = new Field("ree", runEndType, null); + vector.initializeChildrenFromFields(List.of(runEndField, valueField)); + + IntVector runEndsVector = (IntVector) vector.getRunEndsVector(); + runEndsVector.setValueCount(runCount); + for (int i = 0; i < runCount; i++) { + if (runEnds.get(i) == null) { + runEndsVector.setNull(i); + } else { + runEndsVector.set(i, runEnds.get(i)); + } + } + + IntVector valuesVector = (IntVector) vector.getValuesVector(); + valuesVector.setValueCount(runCount); + for (int i = 0; i < runCount; i++) { + if (runEnds.get(i) == null) { + valuesVector.setNull(i); + } else { + valuesVector.set(i, values.get(i)); + } + } + + if (runCount > 0) { + vector.setValueCount(runEnds.get(runCount - 1)); + } else { + vector.setValueCount(0); + } + } } diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index d31c93119b73a..8c8c09265d0bf 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,7 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - bool8, fixed_shape_tensor, opaque, uuid, + bool8, fixed_shape_tensor, json_, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -183,7 +183,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, Bool8Type, FixedShapeTensorType, - OpaqueType, UuidType, + JsonType, OpaqueType, UuidType, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, @@ -218,7 +218,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, - OpaqueArray, UuidArray, + JsonArray, OpaqueArray, UuidArray, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, @@ -236,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, UuidScalar) + FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, @@ -267,7 +267,7 @@ def print_entry(label, value): from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, concat_arrays, concat_tables, TableGroupBy, - RecordBatchReader) + RecordBatchReader, concat_batches) # Exceptions from pyarrow.lib import (ArrowCancelled, diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index d39120934d5fd..658f6b6cac4b5 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2441,7 +2441,7 @@ cdef class Expression(_Weakrefable): ) @staticmethod - def from_substrait(object buffer not None): + def from_substrait(object message not None): """ Deserialize an expression from Substrait @@ -2453,7 +2453,7 @@ cdef class Expression(_Weakrefable): 
         Parameters
         ----------
-        buffer : bytes or Buffer
+        message : bytes or Buffer or a protobuf Message
             The Substrait message to deserialize
 
         Returns
         -------
         Expression
             The deserialized expression
         """
-        expressions = _pas().deserialize_expressions(buffer).expressions
+        expressions = _pas().BoundExpressions.from_substrait(message).expressions
         if len(expressions) == 0:
             raise ValueError("Substrait message did not contain any expressions")
         if len(expressions) > 1:
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 6b5259f499f05..39e3f4d665d88 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -39,6 +39,11 @@ from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
 from pyarrow._json cimport ParseOptions as JsonParseOptions
 from pyarrow._json cimport ReadOptions as JsonReadOptions
 
+try:
+    import pyarrow.substrait as pa_substrait
+except ImportError:
+    pa_substrait = None
+
 _DEFAULT_BATCH_SIZE = 2**17
 _DEFAULT_BATCH_READAHEAD = 16
 
@@ -272,6 +277,13 @@ cdef class Dataset(_Weakrefable):
         # at the moment only support filter
         requested_filter = options.get("filter")
+        if pa_substrait and isinstance(requested_filter, pa_substrait.BoundExpressions):
+            expressions = list(requested_filter.expressions.values())
+            if len(expressions) != 1:
+                raise ValueError(
+                    "Only one BoundExpressions with a single expression is supported")
+            new_options["filter"] = requested_filter = expressions[0]
+
         current_filter = self._scan_options.get("filter")
         if requested_filter is not None and current_filter is not None:
             new_options["filter"] = current_filter & requested_filter
@@ -282,7 +294,7 @@ cdef class Dataset(_Weakrefable):
 
     def scanner(self,
                 object columns=None,
-                Expression filter=None,
+                object filter=None,
                 int batch_size=_DEFAULT_BATCH_SIZE,
                 int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                 int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
@@ -3447,6 +3459,9 @@ cdef void _populate_builder(const shared_ptr[CScannerBuilder]& ptr,
                 filter, pyarrow_wrap_schema(builder.schema()))))
 
     if columns is not None:
+        if pa_substrait and isinstance(columns, pa_substrait.BoundExpressions):
+            columns = columns.expressions
+
         if isinstance(columns, dict):
             for expr in columns.values():
                 if not isinstance(expr, Expression):
@@ -3527,7 +3542,7 @@ cdef class Scanner(_Weakrefable):
     @staticmethod
     def from_dataset(Dataset dataset not None, *,
                      object columns=None,
-                     Expression filter=None,
+                     object filter=None,
                      int batch_size=_DEFAULT_BATCH_SIZE,
                      int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                      int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx
index 067cb5f91681b..d9359c8e77d00 100644
--- a/python/pyarrow/_substrait.pyx
+++ b/python/pyarrow/_substrait.pyx
@@ -26,6 +26,13 @@ from pyarrow.lib cimport *
 from pyarrow.includes.libarrow cimport *
 from pyarrow.includes.libarrow_substrait cimport *
 
+try:
+    import substrait as py_substrait
+except ImportError:
+    py_substrait = None
+else:
+    import substrait.proto  # no-cython-lint
+
 # TODO GH-37235: Fix exception handling
 cdef CDeclaration _create_named_table_provider(
@@ -133,7 +140,7 @@ def run_query(plan, *, table_provider=None, use_threads=True):
         c_bool c_use_threads
 
     c_use_threads = use_threads
-    if isinstance(plan, bytes):
+    if isinstance(plan, (bytes, memoryview)):
         c_buf_plan = pyarrow_unwrap_buffer(py_buffer(plan))
     elif isinstance(plan, Buffer):
         c_buf_plan = pyarrow_unwrap_buffer(plan)
@@ -187,6 +194,105 @@ def _parse_json_plan(plan):
     return pyarrow_wrap_buffer(c_buf_plan)
 
 
+class SubstraitSchema:
+    """A Schema encoded for Substrait usage.
+
+    The SubstraitSchema contains a schema represented
+    both as a substrait ``NamedStruct`` and as an
+    ``ExtendedExpression``.
+
+    The ``ExtendedExpression`` is available for cases where types
+    used by the schema require extensions to decode them.
+    In such a case the schema will be the ``base_schema`` of the
+    ``ExtendedExpression`` and all extensions will be provided.
+    """
+
+    def __init__(self, schema, expression):
+        self.schema = schema
+        self.expression = expression
+
+    def to_pysubstrait(self):
+        """Convert the schema to a substrait-python ExtendedExpression object."""
+        if py_substrait is None:
+            raise ImportError("The 'substrait' package is required.")
+        return py_substrait.proto.ExtendedExpression.FromString(self.expression)
+
+
+def serialize_schema(schema):
+    """
+    Serialize a schema into a SubstraitSchema object.
+
+    Parameters
+    ----------
+    schema : Schema
+        The schema to serialize
+
+    Returns
+    -------
+    SubstraitSchema
+        The schema stored in a SubstraitSchema object.
+    """
+    return SubstraitSchema(
+        schema=_serialize_namedstruct_schema(schema),
+        expression=serialize_expressions([], [], schema, allow_arrow_extensions=True)
+    )
+
+
+def _serialize_namedstruct_schema(schema):
+    cdef:
+        CResult[shared_ptr[CBuffer]] c_res_buffer
+        shared_ptr[CBuffer] c_buffer
+        CConversionOptions c_conversion_options
+        CExtensionSet c_extensions
+
+    with nogil:
+        c_res_buffer = SerializeSchema(deref((<Schema> schema).sp_schema), &c_extensions, c_conversion_options)
+        c_buffer = GetResultValue(c_res_buffer)
+
+    return memoryview(pyarrow_wrap_buffer(c_buffer))
+
+
+def deserialize_schema(buf):
+    """
+    Deserialize a ``NamedStruct`` Substrait message
+    or a SubstraitSchema object into an Arrow Schema object.
+
+    Parameters
+    ----------
+    buf : Buffer or bytes or SubstraitSchema
+        The message to deserialize
+
+    Returns
+    -------
+    Schema
+        The deserialized schema
+    """
+    cdef:
+        shared_ptr[CBuffer] c_buffer
+        CResult[shared_ptr[CSchema]] c_res_schema
+        shared_ptr[CSchema] c_schema
+        CConversionOptions c_conversion_options
+        CExtensionSet c_extensions
+
+    if isinstance(buf, SubstraitSchema):
+        return deserialize_expressions(buf.expression).schema
+
+    if isinstance(buf, (bytes, memoryview)):
+        c_buffer = pyarrow_unwrap_buffer(py_buffer(buf))
+    elif isinstance(buf, Buffer):
+        c_buffer = pyarrow_unwrap_buffer(buf)
+    else:
+        raise TypeError(
+            f"Expected 'pyarrow.Buffer' or bytes, got '{type(buf)}'")
+
+    with nogil:
+        c_res_schema = DeserializeSchema(
+            deref(c_buffer), c_extensions, c_conversion_options)
+        c_schema = GetResultValue(c_res_schema)
+
+    return pyarrow_wrap_schema(c_schema)
+
+
 def serialize_expressions(exprs, names, schema, *, allow_arrow_extensions=False):
     """
     Serialize a collection of expressions into Substrait
@@ -245,7 +351,7 @@ def serialize_expressions(exprs, names, schema, *, allow_arrow_extensions=False)
     with nogil:
         c_res_buffer = SerializeExpressions(c_bound_exprs, c_conversion_options)
         c_buffer = GetResultValue(c_res_buffer)
-    return pyarrow_wrap_buffer(c_buffer)
+    return memoryview(pyarrow_wrap_buffer(c_buffer))
 
 
 cdef class BoundExpressions(_Weakrefable):
@@ -290,6 +396,32 @@ cdef class BoundExpressions(_Weakrefable):
         self.init(bound_expressions)
         return self
 
+    @classmethod
+    def from_substrait(cls, message):
+        """
+        Convert a Substrait message into a BoundExpressions object
+
+        Parameters
+        ----------
+        message : Buffer or bytes or protobuf Message
+            The message to convert to a BoundExpressions object
+
+        Returns
+        -------
+        BoundExpressions
+            The converted expressions, their names, and the bound schema
+        """
+        if isinstance(message, (bytes, memoryview)):
+            return deserialize_expressions(message)
+        elif isinstance(message, Buffer):
+            return deserialize_expressions(message)
+        else:
+            try:
+                return deserialize_expressions(message.SerializeToString())
+            except AttributeError:
+                raise TypeError(
+                    f"Expected 'pyarrow.Buffer' or bytes or protobuf Message, got '{type(message)}'")
+
 
 def deserialize_expressions(buf):
     """
@@ -310,7 +442,7 @@ def deserialize_expressions(buf):
         CResult[CBoundExpressions] c_res_bound_exprs
         CBoundExpressions c_bound_exprs
 
-    if isinstance(buf, bytes):
+    if isinstance(buf, (bytes, memoryview)):
         c_buffer = pyarrow_unwrap_buffer(py_buffer(buf))
     elif isinstance(buf, Buffer):
         c_buffer = pyarrow_unwrap_buffer(buf)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae9e7fd777ed1..eaedbf1e38580 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
         return result
 
 
+class JsonArray(ExtensionArray):
+    """
+    Concrete class for Arrow arrays of JSON data type.
+
+    This does not guarantee that the JSON data actually
+    is valid JSON.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+
 class UuidArray(ExtensionArray):
     """
     Concrete class for Arrow arrays of UUID data type.
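Taken together, the JsonType/JsonArray/JsonScalar additions above give the `arrow.json` canonical extension type a complete Python surface. A minimal usage sketch, assuming a PyArrow build that includes these changes (it mirrors the `test_json` test later in this diff):

    import pyarrow as pa

    # Build a JSON extension array on top of a string storage array.
    json_type = pa.json_(pa.string())  # storage may be string, large_string or string_view
    arr = pa.array(['{"id": 1}', None, '{"id": 2, "tags": ["a"]}'], type=json_type)

    assert isinstance(arr, pa.JsonArray)
    assert isinstance(arr[0], pa.JsonScalar)
    assert arr[0].as_py() == '{"id": 1}'  # scalars yield the raw JSON string
    assert arr[1].as_py() is None

    # Casting between the extension type and its storage type round-trips.
    storage = arr.cast(pa.string())
    assert storage.cast(json_type) == arr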
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
index 044dd0333f323..9297436c1cf8c 100644
--- a/python/pyarrow/includes/common.pxd
+++ b/python/pyarrow/includes/common.pxd
@@ -173,3 +173,12 @@ cdef inline object PyObject_to_object(PyObject* o):
     cdef object result = <object> o
     cpython.Py_DECREF(result)
     return result
+
+
+cdef extern from "<string_view>" namespace "std" nogil:
+    cdef cppclass cpp_string_view "std::string_view":
+        string_view()
+        string_view(const char*)
+        size_t size()
+        bint empty()
+        const char* data()
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8e6922a912a32..a70cb91873e45 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1356,6 +1356,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
                                 CConcatenateTablesOptions options,
                                 CMemoryPool* memory_pool)
 
+    CResult[shared_ptr[CRecordBatch]] ConcatenateRecordBatches(
+        const vector[shared_ptr[CRecordBatch]]& batches,
+        CMemoryPool* memory_pool)
+
     cdef cppclass CDictionaryUnifier" arrow::DictionaryUnifier":
         @staticmethod
         CResult[shared_ptr[CChunkedArray]] UnifyChunkedArray(
@@ -2867,6 +2871,13 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
         shared_ptr[CArray] storage()
 
 
+cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
+    cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):
+
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)
+
+
 cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
     cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
 
diff --git a/python/pyarrow/includes/libarrow_substrait.pxd b/python/pyarrow/includes/libarrow_substrait.pxd
index c41f4c05d3a77..865568e2ba6f1 100644
--- a/python/pyarrow/includes/libarrow_substrait.pxd
+++ b/python/pyarrow/includes/libarrow_substrait.pxd
@@ -45,6 +45,20 @@ cdef extern from "arrow/engine/substrait/options.h" namespace "arrow::engine" no
 
 cdef extern from "arrow/engine/substrait/extension_set.h" \
         namespace "arrow::engine" nogil:
+    cdef struct CSubstraitId "arrow::engine::Id":
+        cpp_string_view uri
+        cpp_string_view name
+
+    cdef struct CExtensionSetTypeRecord "arrow::engine::ExtensionSet::TypeRecord":
+        CSubstraitId id
+        shared_ptr[CDataType] type
+
+    cdef cppclass CExtensionSet "arrow::engine::ExtensionSet":
+        CExtensionSet()
+        unordered_map[uint32_t, cpp_string_view]& uris()
+        CResult[uint32_t] EncodeType(const CDataType&)
+        CResult[CExtensionSetTypeRecord] DecodeType(uint32_t)
+
     cdef cppclass ExtensionIdRegistry:
         std_vector[c_string] GetSupportedSubstraitFunctions()
 
@@ -68,6 +82,15 @@ cdef extern from "arrow/engine/substrait/serde.h" namespace "arrow::engine" nogi
     CResult[CBoundExpressions] DeserializeExpressions(
         const CBuffer& serialized_expressions)
 
+    CResult[shared_ptr[CBuffer]] SerializeSchema(
+        const CSchema &schema, CExtensionSet* extension_set,
+        const CConversionOptions& conversion_options)
+
+    CResult[shared_ptr[CSchema]] DeserializeSchema(
+        const CBuffer& serialized_schema, const CExtensionSet& extension_set,
+        const CConversionOptions& conversion_options)
+
+
 cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil:
     CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan(
         const CBuffer& substrait_buffer, const ExtensionIdRegistry* registry,
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 25a7945dc3ddc..f3d4e1eec0899 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
     cdef:
         const CUuidType* uuid_ext_type
 
+cdef class JsonType(BaseExtensionType):
+    cdef:
+        const CJsonType* json_ext_type
+
+
 cdef class PyExtensionType(ExtensionType):
     pass
 
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index d3e2ff2e99d91..913e25e308254 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
             out = OpaqueType.__new__(OpaqueType)
         elif extension_name == b"arrow.uuid":
             out = UuidType.__new__(UuidType)
+        elif extension_name == b"arrow.json":
+            out = JsonType.__new__(JsonType)
         else:
             out = BaseExtensionType.__new__(BaseExtensionType)
     else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 68f77832c4342..2bfdcddf30736 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
         return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
 
 
+class JsonScalar(ExtensionScalar):
+    """
+    Concrete class for JSON extension scalar.
+    """
+
+
 class UuidScalar(ExtensionScalar):
     """
     Concrete class for Uuid extension scalar.
diff --git a/python/pyarrow/substrait.py b/python/pyarrow/substrait.py
index a2b217f4936c5..db2c3a96a1955 100644
--- a/python/pyarrow/substrait.py
+++ b/python/pyarrow/substrait.py
@@ -21,7 +21,10 @@
         get_supported_functions,
         run_query,
         deserialize_expressions,
-        serialize_expressions
+        serialize_expressions,
+        deserialize_schema,
+        serialize_schema,
+        SubstraitSchema
     )
 except ImportError as exc:
     raise ImportError(
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 819bbc34c66b9..af241e4be07d9 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -6259,6 +6259,57 @@ def concat_tables(tables, MemoryPool memory_pool=None, str promote_options="none
     return pyarrow_wrap_table(c_result_table)
 
 
+def concat_batches(recordbatches, MemoryPool memory_pool=None):
+    """
+    Concatenate pyarrow.RecordBatch objects.
+
+    All recordbatches must share the same Schema; the operation
+    implies a copy of the data to merge the arrays of the
+    different RecordBatches.
+
+    Parameters
+    ----------
+    recordbatches : iterable of pyarrow.RecordBatch objects
+        Pyarrow record batches to concatenate into a single RecordBatch.
+    memory_pool : MemoryPool, default None
+        For memory allocations, if required, otherwise use default pool.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> t1 = pa.record_batch([
+    ...     pa.array([2, 4, 5, 100]),
+    ...     pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])
+    ... ], names=['n_legs', 'animals'])
+    >>> t2 = pa.record_batch([
+    ...     pa.array([2, 4]),
+    ...     pa.array(["Parrot", "Dog"])
+    ... ], names=['n_legs', 'animals'])
+    >>> pa.concat_batches([t1,t2])
+    pyarrow.RecordBatch
+    n_legs: int64
+    animals: string
+    ----
+    n_legs: [2,4,5,100,2,4]
+    animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"]
+
+    """
+    cdef:
+        vector[shared_ptr[CRecordBatch]] c_recordbatches
+        shared_ptr[CRecordBatch] c_result_recordbatch
+        RecordBatch recordbatch
+        CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+
+    for recordbatch in recordbatches:
+        c_recordbatches.push_back(recordbatch.sp_batch)
+
+    with nogil:
+        c_result_recordbatch = GetResultValue(
+            ConcatenateRecordBatches(c_recordbatches, pool))
+
+    return pyarrow_wrap_batch(c_result_recordbatch)
+
+
 def _from_pydict(cls, mapping, schema, metadata):
     """
     Construct a Table/RecordBatch from Arrow arrays or columns.
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 79dd96948261c..1428f80239771 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -510,3 +510,14 @@ def test_large_binary_overflow():
             pa.ArrowInvalid,
             match="Parquet cannot store strings with size 2GB or more"):
         _write_table(table, writer, use_dictionary=use_dictionary)
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    arr = pa.array(data, type=pa.json_(storage_type))
+
+    table = pa.table([arr], names=["ext"])
+
+    _simple_table_roundtrip(table)
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
index 0d3a0fbd3bec7..772670ad79fd3 100644
--- a/python/pyarrow/tests/test_dataset.py
+++ b/python/pyarrow/tests/test_dataset.py
@@ -5730,3 +5730,37 @@ def test_make_write_options_error():
     msg = "make_write_options\\(\\) takes exactly 0 positional arguments"
     with pytest.raises(TypeError, match=msg):
         pformat.make_write_options(43)
+
+
+def test_scanner_from_substrait(dataset):
+    try:
+        import pyarrow.substrait as ps
+    except ImportError:
+        pytest.skip("substrait NOT enabled")
+
+    # SELECT str WHERE i64 = 4
+    projection = (b'\nS\x08\x0c\x12Ohttps://github.com/apache/arrow/blob/main/format'
+                  b'/substrait/extension_types.yaml\x12\t\n\x07\x08\x0c\x1a\x03u64'
+                  b'\x12\x0b\n\t\x08\x0c\x10\x01\x1a\x03u32\x1a\x0f\n\x08\x12\x06'
+                  b'\n\x04\x12\x02\x08\x02\x1a\x03str"i\n\x03i64\n\x03f64\n\x03str'
+                  b'\n\x05const\n\x06struct\n\x01a\n\x01b\n\x05group\n\x03key'
+                  b'\x127\n\x04:\x02\x10\x01\n\x04Z\x02\x10\x01\n\x04b\x02\x10'
+                  b'\x01\n\x04:\x02\x10\x01\n\x11\xca\x01\x0e\n\x04:\x02\x10\x01'
+                  b'\n\x04b\x02\x10\x01\x18\x01\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01')
+    filtering = (b'\n\x1e\x08\x06\x12\x1a/functions_comparison.yaml\nS\x08\x0c\x12'
+                 b'Ohttps://github.com/apache/arrow/blob/main/format'
+                 b'/substrait/extension_types.yaml\x12\x18\x1a\x16\x08\x06\x10\xc5'
+                 b'\x01\x1a\x0fequal:any1_any1\x12\t\n\x07\x08\x0c\x1a\x03u64\x12'
+                 b'\x0b\n\t\x08\x0c\x10\x01\x1a\x03u32\x1a\x1f\n\x1d\x1a\x1b\x08'
+                 b'\xc5\x01\x1a\x04\n\x02\x10\x02"\x08\x1a\x06\x12\x04\n\x02\x12\x00'
+                 b'"\x06\x1a\x04\n\x02(\x04"i\n\x03i64\n\x03f64\n\x03str\n\x05const'
+                 b'\n\x06struct\n\x01a\n\x01b\n\x05group\n\x03key\x127\n\x04:\x02'
+                 b'\x10\x01\n\x04Z\x02\x10\x01\n\x04b\x02\x10\x01\n\x04:\x02\x10'
+                 b'\x01\n\x11\xca\x01\x0e\n\x04:\x02\x10\x01\n\x04b\x02\x10\x01'
+                 b'\x18\x01\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01')
+
+    result = dataset.scanner(
+        columns=ps.BoundExpressions.from_substrait(projection),
+        filter=ps.BoundExpressions.from_substrait(filtering)
+    ).to_table()
+    assert result.to_pydict() == {'str': ['4', '4']}
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index b74eca75bdca9..634d9ce2d8d93 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1926,3 +1926,56 @@ def test_bool8_scalar():
     assert pa.scalar(1, type=pa.bool8()).as_py() is True
     assert pa.scalar(2, type=pa.bool8()).as_py() is True
     assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    json_type = pa.json_(storage_type)
+    storage = pa.array(data, type=storage_type)
+    array = pa.array(data, type=json_type)
+    json_arr_class = json_type.__arrow_ext_class__()
+
+    assert pa.json_() == pa.json_(pa.utf8())
+    assert json_type.extension_name == "arrow.json"
+    assert json_type.storage_type == storage_type
+    assert json_type.__class__ is pa.JsonType
+
+    assert json_type == pa.json_(storage_type)
+    assert json_type != storage_type
+
+    assert isinstance(array, pa.JsonArray)
+
+    assert array.to_pylist() == data
+    assert array[0].as_py() == data[0]
+    assert array[2].as_py() is None
+
+    # Pickle roundtrip
+    result = pickle_module.loads(pickle_module.dumps(json_type))
+    assert result == json_type
+
+    # IPC roundtrip
+    buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+    batch = ipc_read_batch(buf)
+    reconstructed_array = batch.column(0)
+    assert reconstructed_array.type == json_type
+    assert reconstructed_array == array
+    assert isinstance(array, json_arr_class)
+
+    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+    assert isinstance(array[0], pa.JsonScalar)
+
+    # cast storage -> extension type
+    result = storage.cast(json_type)
+    assert result == array
+
+    # cast extension type -> storage type
+    inner = array.cast(storage_type)
+    assert inner == storage
+
+    for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+        with pytest.raises(
+                pa.ArrowInvalid,
+                match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
+            pa.json_(storage_type)
diff --git a/python/pyarrow/tests/test_memory.py b/python/pyarrow/tests/test_memory.py
index 53c25f3b3ef20..b1eef176665af 100644
--- a/python/pyarrow/tests/test_memory.py
+++ b/python/pyarrow/tests/test_memory.py
@@ -17,6 +17,7 @@
 
 import contextlib
 import os
+import platform
 import signal
 import subprocess
 import sys
@@ -30,7 +31,7 @@
 
 possible_backends = ["system", "jemalloc", "mimalloc"]
 
-should_have_jemalloc = sys.platform == "linux"
+should_have_jemalloc = (sys.platform == "linux" and platform.machine() == 'x86_64')
 should_have_mimalloc = sys.platform == "win32"
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 5d3471c7c35db..0b2055018f695 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
     pa.Bool8Array,
     pa.Bool8Scalar,
     pa.Bool8Type,
+    pa.JsonArray,
+    pa.JsonScalar,
+    pa.JsonType,
 ])
 def test_extension_type_constructor_errors(klass):
     # ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py
index 01d468cd9e9cc..fcd1c8d48c5fc 100644
--- a/python/pyarrow/tests/test_substrait.py
+++ b/python/pyarrow/tests/test_substrait.py
@@ -105,7 +105,7 @@ def test_run_query_input_types(tmpdir, query):
     # Otherwise error for invalid query
     msg = "ParseFromZeroCopyStream failed for substrait.Plan"
-    with pytest.raises(OSError, match=msg):
+    with pytest.raises(ArrowInvalid, match=msg):
         substrait.run_query(query)
 
 
@@ -1077,3 +1077,44 @@ def test_serializing_udfs():
     assert schema == returned.schema
     assert len(returned.expressions) == 1
     assert str(returned.expressions["expr"]) == str(exprs[0])
+
+
+def test_serializing_schema():
+    substrait_schema = b'\n\x01x\n\x01y\x12\x0c\n\x04*\x02\x10\x01\n\x04b\x02\x10\x01'
+    expected_schema = pa.schema([
+        pa.field("x", pa.int32()),
+        pa.field("y", pa.string())
+    ])
+    returned = pa.substrait.deserialize_schema(substrait_schema)
+    assert expected_schema == returned
+
+    arrow_substrait_schema = pa.substrait.serialize_schema(returned)
+    assert arrow_substrait_schema.schema == substrait_schema
+
+    returned = pa.substrait.deserialize_schema(arrow_substrait_schema)
+    assert expected_schema == returned
+
+    returned = pa.substrait.deserialize_schema(arrow_substrait_schema.schema)
+    assert expected_schema == returned
+
+    returned = pa.substrait.deserialize_expressions(arrow_substrait_schema.expression)
+    assert returned.schema == expected_schema
+
+
+def test_bound_expression_from_Message():
+    class FakeMessage:
+        def __init__(self, expr):
+            self.expr = expr
+
+        def SerializeToString(self):
+            return self.expr
+
+    # SELECT project_release, project_version
+    message = (b'\x1a\x1b\n\x08\x12\x06\n\x04\x12\x02\x08\x01\x1a\x0fproject_release'
+               b'\x1a\x19\n\x06\x12\x04\n\x02\x12\x00\x1a\x0fproject_version'
+               b'"0\n\x0fproject_version\n\x0fproject_release'
+               b'\x12\x0c\n\x04:\x02\x10\x01\n\x04b\x02\x10\x01')
+    exprs = pa.substrait.BoundExpressions.from_substrait(FakeMessage(message))
+    assert len(exprs.expressions) == 2
+    assert 'project_release' in exprs.expressions
+    assert 'project_version' in exprs.expressions
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index b66a5eb083cc5..4c058ccecda5e 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2037,6 +2037,49 @@ def test_table_negative_indexing():
         table[4]
 
+
+def test_concat_batches():
+    data = [
+        list(range(5)),
+        [-10., -5., 0., 5., 10.]
+    ]
+    data2 = [
+        list(range(5, 10)),
+        [1., 2., 3., 4., 5.]
+    ]
+
+    t1 = pa.RecordBatch.from_arrays([pa.array(x) for x in data],
+                                    names=('a', 'b'))
+    t2 = pa.RecordBatch.from_arrays([pa.array(x) for x in data2],
+                                    names=('a', 'b'))
+
+    result = pa.concat_batches([t1, t2])
+    result.validate()
+    assert len(result) == 10
+
+    expected = pa.RecordBatch.from_arrays([pa.array(x + y)
+                                           for x, y in zip(data, data2)],
+                                          names=('a', 'b'))
+
+    assert result.equals(expected)
+
+
+def test_concat_batches_different_schema():
+    t1 = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2], type=pa.int64())], ["f"])
+    t2 = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2], type=pa.float32())], ["f"])
+
+    with pytest.raises(pa.ArrowInvalid,
+                       match="not match index 0 recordbatch schema"):
+        pa.concat_batches([t1, t2])
+
+
+def test_concat_batches_none_batches():
+    # ARROW-11997
+    with pytest.raises(AttributeError):
+        pa.concat_batches([None])
+
+
 @pytest.mark.parametrize(
     ('cls'),
     [
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 2a05f87615074..fef350d5de958 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -1153,6 +1153,13 @@ def test_field_basic():
         pa.field('foo', None)
 
 
+def test_field_datatype_alias():
+    f = pa.field('foo', 'string')
+
+    assert f.name == 'foo'
+    assert f.type is pa.string()
+
+
 def test_field_equals():
     meta1 = {b'foo': b'bar'}
     meta2 = {b'bizz': b'bazz'}
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 70f12e9796e80..4aa8238556a9c 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
         return ExtensionScalar
 
 
+cdef class JsonType(BaseExtensionType):
+    """
+    Concrete class for JSON extension type.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.json_ext_type = <const CJsonType*> type.get()
+
+    def __arrow_ext_class__(self):
+        return JsonArray
+
+    def __reduce__(self):
+        return json_, (self.storage_type,)
+
+    def __arrow_ext_scalar_class__(self):
+        return JsonScalar
+
+
 cdef class UuidType(BaseExtensionType):
     """
     Concrete class for UUID extension type.
@@ -3676,8 +3713,8 @@ def field(name, type=None, nullable=None, metadata=None):
         Name of the field. Alternatively, you can also pass an object
         that implements the Arrow PyCapsule Protocol for schemas
         (has an ``__arrow_c_schema__`` method).
-    type : pyarrow.DataType
-        Arrow datatype of the field.
+    type : pyarrow.DataType or str
+        Arrow datatype of the field or a string matching one.
     nullable : bool, default True
         Whether the field's values are nullable.
     metadata : dict, default None
@@ -3709,6 +3746,11 @@
 
     >>> pa.struct([field])
     StructType(struct<key: int32>)
+
+    A str can also be passed for the type parameter:
+
+    >>> pa.field('key', 'int32')
+    pyarrow.Field<key: int32>
     """
     if hasattr(name, "__arrow_c_schema__"):
         if type is not None:
@@ -5296,6 +5338,44 @@ def run_end_encoded(run_end_type, value_type):
     return pyarrow_wrap_data_type(ree_type)
 
 
+def json_(DataType storage_type=utf8()):
+    """
+    Create instance of JSON extension type.
+
+    Parameters
+    ----------
+    storage_type : DataType, default pyarrow.string()
+        The underlying data type. Can be one of the following types:
+        string, large_string, string_view.
+
+    Returns
+    -------
+    type : JsonType
+
+    Examples
+    --------
+    Create an instance of JSON extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.json_(pa.utf8())
+    JsonType(extension<arrow.json>)
+
+    Use the JSON type to create an array:
+
+    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      "{"a": 1}",
+      "{"b": 2}"
+    ]
+    """
+
+    cdef JsonType out = JsonType.__new__(JsonType)
+    c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
+    out.init(c_json_ext_type)
+    return out
+
+
 def uuid():
     """
     Create UuidType instance.
@@ -5642,6 +5722,25 @@ def schema(fields, metadata=None):
     some_int: int32
     some_string: string
 
+    DataTypes can also be passed as strings. The following is equivalent to the
+    above example:
+
+    >>> pa.schema([
+    ...     pa.field('some_int', "int32"),
+    ...     pa.field('some_string', "string")
+    ... ])
+    some_int: int32
+    some_string: string
+
+    Or more concisely:
+
+    >>> pa.schema([
+    ...     ('some_int', "int32"),
+    ...     ('some_string', "string")
+    ... ])
+    some_int: int32
+    some_string: string
+
     Returns
     -------
     schema : pyarrow.Schema
diff --git a/r/src/altrep.cpp b/r/src/altrep.cpp
index bdaac0a9ce5d2..90a459e19cb6d 100644
--- a/r/src/altrep.cpp
+++ b/r/src/altrep.cpp
@@ -80,6 +80,14 @@ void DeletePointer(std::shared_ptr<T>* ptr) {
 template <typename T>
 using Pointer = cpp11::external_pointer<std::shared_ptr<T>, DeletePointer<T>>;
 
+#if ARROW_VERSION_MAJOR >= 18
+using ChunkResolver = arrow::ChunkResolver;
+using ChunkLocation = arrow::ChunkLocation;
+#else
+using ChunkResolver = arrow::internal::ChunkResolver;
+using ChunkLocation = arrow::internal::ChunkLocation;
+#endif
+
 class ArrowAltrepData {
  public:
  explicit ArrowAltrepData(const std::shared_ptr<arrow::ChunkedArray>& chunked_array)
@@ -87,13 +95,11 @@ class ArrowAltrepData {
 
   const std::shared_ptr<arrow::ChunkedArray>& chunked_array() { return chunked_array_; }
 
-  arrow::internal::ChunkLocation locate(int64_t index) {
-    return resolver_.Resolve(index);
-  }
+  ChunkLocation locate(int64_t index) { return resolver_.Resolve(index); }
 
  private:
  std::shared_ptr<arrow::ChunkedArray> chunked_array_;
-  arrow::internal::ChunkResolver resolver_;
+  ChunkResolver resolver_;
 };
 
 // the ChunkedArray that is being wrapped by the altrep object
diff --git a/swift/Arrow/Package.swift b/swift/Arrow/Package.swift
index 85407ea1b96b8..f589232fe95f4 100644
--- a/swift/Arrow/Package.swift
+++ b/swift/Arrow/Package.swift
@@ -31,11 +31,7 @@ let package = Package(
             targets: ["Arrow"])
     ],
     dependencies: [
-        // The latest version of flatbuffers v23.5.26 was built in May 26, 2023
-        // and therefore doesn't include the unaligned buffer swift changes.
-        // This can be changed back to using the tag once a new version of
-        // flatbuffers has been released.
-        .package(url: "https://github.com/google/flatbuffers.git", branch: "master"),
+        .package(url: "https://github.com/google/flatbuffers.git", branch: "v24.3.7"),
         .package(
             url: "https://github.com/apple/swift-atomics.git",
             .upToNextMajor(from: "1.2.0")  // or `.upToNextMinor
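The Substrait schema helpers added in `_substrait.pyx` are designed to round-trip through any of the encodings they expose. A short sketch based on the `test_serializing_schema` test above, assuming a PyArrow build with Substrait support enabled:

    import pyarrow as pa
    import pyarrow.substrait as ps

    schema = pa.schema([pa.field("x", pa.int32()), pa.field("y", pa.string())])

    # serialize_schema returns a SubstraitSchema carrying both a
    # NamedStruct message and an ExtendedExpression message.
    ss = ps.serialize_schema(schema)

    # Each representation deserializes back to the same Arrow schema.
    assert ps.deserialize_schema(ss) == schema
    assert ps.deserialize_schema(ss.schema) == schema
    assert ps.deserialize_expressions(ss.expression).schema == schema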