diff --git a/.env b/.env index 427a4ab0bf398..eb87dc62bdd8c 100644 --- a/.env +++ b/.env @@ -92,13 +92,13 @@ DEVTOOLSET_VERSION= # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. -VCPKG="501db0f17ef6df184fcdbfbe0f87cde2313b6ab1" # 2023.04.15 Release +VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release # This must be updated when we update -# ci/docker/python-wheel-windows-vs2017.dockerfile. +# ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2023-08-02 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-02-05 # Use conanio/${CONAN} for "docker-compose run --rm conan". See # https://github.com/conan-io/conan-docker-tools#readme for available diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 9fbad06692bd2..e9409f1cd6248 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -57,37 +57,65 @@ env: DOCKER_VOLUME_PREFIX: ".docker/" jobs: + docker-targets: + name: Docker targets + runs-on: ubuntu-latest + outputs: + targets: ${{ steps.detect-targets.outputs.targets }} + steps: + - name: Detect targets + id: detect-targets + run: | + echo "targets<> "$GITHUB_OUTPUT" + echo "[" >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch": "amd64", + "clang-tools": "14", + "image": "conda-cpp", + "llvm": "14", + "runs-on": "ubuntu-latest", + "simd-level": "AVX2", + "title": "AMD64 Conda C++ AVX2", + "ubuntu": "22.04" + }, + { + "arch": "amd64", + "clang-tools": "14", + "image": "ubuntu-cpp-sanitizer", + "llvm": "14", + "runs-on": "ubuntu-latest", + "title": "AMD64 Ubuntu 22.04 C++ ASAN UBSAN", + "ubuntu": "22.04" + } + JSON + if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then + echo "," >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch": "arm64v8", + "clang-tools": "10", + "image": "ubuntu-cpp", + "llvm": "10", + "runs-on": ["self-hosted", "arm", "linux"], + "title": "ARM64 Ubuntu 20.04 C++", + "ubuntu": "20.04" + } + JSON + fi + echo "]" >> "$GITHUB_OUTPUT" + echo "JSON" >> "$GITHUB_OUTPUT" + docker: name: ${{ matrix.title }} + needs: docker-targets runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 strategy: fail-fast: false matrix: - include: - - arch: amd64 - clang-tools: "14" - image: conda-cpp - llvm: "14" - runs-on: ubuntu-latest - simd-level: AVX2 - title: AMD64 Conda C++ AVX2 - ubuntu: "22.04" - - arch: amd64 - clang-tools: "14" - image: ubuntu-cpp-sanitizer - llvm: "14" - runs-on: ubuntu-latest - title: AMD64 Ubuntu 22.04 C++ ASAN UBSAN - ubuntu: "22.04" - - arch: arm64v8 - clang-tools: "10" - image: ubuntu-cpp - llvm: "10" - runs-on: ["self-hosted", "arm", "linux"] - title: ARM64 Ubuntu 20.04 C++ - ubuntu: "20.04" + include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index cd44e65e8811b..bbffab6704087 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -43,31 +43,62 @@ permissions: jobs: + docker-targets: + name: Docker targets + runs-on: ubuntu-latest + if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + outputs: + targets: ${{ steps.detect-targets.outputs.targets }} + steps: + - name: Detect targets + id: detect-targets + run: | + echo "targets<> "$GITHUB_OUTPUT" + echo "[" >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch-label": "AMD64", + "arch": "amd64", + "go": "1.19", + "runs-on": "ubuntu-latest" + }, + { + "arch-label": "AMD64", + "arch": "amd64", + "go": "1.20", + "runs-on": "ubuntu-latest" + } + JSON + if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then + echo "," >> "$GITHUB_OUTPUT" + cat <> "$GITHUB_OUTPUT" + { + "arch-label": "ARM64", + "arch": "arm64v8", + "go": "1.19", + "runs-on": ["self-hosted", "arm", "linux"] + }, + { + "arch-label": "ARM64", + "arch": "arm64v8", + "go": "1.20", + "runs-on": ["self-hosted", "arm", "linux"] + } + JSON + fi + echo "]" >> "$GITHUB_OUTPUT" + echo "JSON" >> "$GITHUB_OUTPUT" + docker: name: ${{ matrix.arch-label }} Debian 11 Go ${{ matrix.go }} + needs: docker-targets runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 strategy: fail-fast: false matrix: - include: - - arch-label: AMD64 - arch: amd64 - go: 1.19 - runs-on: ubuntu-latest - - arch-label: AMD64 - arch: amd64 - go: '1.20' - runs-on: ubuntu-latest - - arch-label: ARM64 - arch: arm64v8 - go: 1.19 - runs-on: ["self-hosted", "arm", "linux"] - - arch-label: ARM64 - arch: arm64v8 - go: '1.20' - runs-on: ["self-hosted", "arm", "linux"] + include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} GO: ${{ matrix.go }} diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 2a801b6040ec8..3d1f75ede4bb5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -21,24 +21,26 @@ on: push: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" pull_request: paths: - ".github/workflows/r.yml" - - "ci/scripts/r_*.sh" - - "ci/scripts/cpp_*.sh" - - "ci/scripts/PKGBUILD" - - "ci/etc/rprofile" - "ci/docker/**" + - "ci/etc/rprofile" + - "ci/scripts/PKGBUILD" + - "ci/scripts/cpp_*.sh" + - "ci/scripts/install_minio.sh" + - "ci/scripts/r_*.sh" - "cpp/**" - - 'docker-compose.yml' + - "docker-compose.yml" - "r/**" concurrency: @@ -256,6 +258,16 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + # This must be done before r-lib/actions/setup-r because curl in + # Rtools doesn't work on non Rtools' MSYS2 environment. If we + # use "shell: bash" after r-lib/actions/setup-r, bash in Rtools + # is used on non Rtools' MSYS2 environment. + - name: Install MinIO + shell: bash + run: | + mkdir -p "$HOME/.local/bin" + ci/scripts/install_minio.sh latest "$HOME/.local" + echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts uses: actions/download-artifact@v3 @@ -282,15 +294,6 @@ jobs: working-directory: 'r' extra-packages: | any::rcmdcheck - - name: Install MinIO - shell: bash - run: | - mkdir -p "$HOME/.local/bin" - curl \ - --output "$HOME/.local/bin/minio.exe" \ - https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z - chmod +x "$HOME/.local/bin/minio.exe" - echo "$HOME/.local/bin" >> $GITHUB_PATH # TODO(ARROW-17149): figure out why the GCS tests are hanging on Windows # - name: Install Google Cloud Storage Testbench # shell: bash diff --git a/c_glib/README.md b/c_glib/README.md index 2a4d6b8a6628c..24e69eff65055 100644 --- a/c_glib/README.md +++ b/c_glib/README.md @@ -101,7 +101,7 @@ $ sudo meson install -C c_glib.build You need to install Arrow C++ before you install Arrow GLib. See Arrow C++ document about how to install Arrow C++. -You need [GTK-Doc](https://www.gtk.org/gtk-doc/) and +You need [GTK-Doc](https://gitlab.gnome.org/GNOME/gtk-doc) and [GObject Introspection](https://wiki.gnome.org/Projects/GObjectIntrospection) to build Arrow GLib. You can install them by the followings: diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index a07c727ac76fa..2831440d5a967 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -62,15 +62,16 @@ COPY ci/vcpkg/*.patch \ COPY ci/scripts/install_vcpkg.sh \ arrow/ci/scripts/ ENV VCPKG_ROOT=/opt/vcpkg -RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} -ENV PATH="${PATH}:${VCPKG_ROOT}" - ARG build_type=release ENV CMAKE_BUILD_TYPE=${build_type} \ VCPKG_FORCE_SYSTEM_BINARIES=1 \ VCPKG_OVERLAY_TRIPLETS=/arrow/ci/vcpkg \ VCPKG_DEFAULT_TRIPLET=${arch_short}-linux-static-${build_type} \ VCPKG_FEATURE_FLAGS="manifests" + +RUN arrow/ci/scripts/install_vcpkg.sh ${VCPKG_ROOT} ${vcpkg} +ENV PATH="${PATH}:${VCPKG_ROOT}" + COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/ # cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains # ssl related fixes as well as we can patch the vcpkg portfile to support diff --git a/ci/docker/python-wheel-windows-test-vs2017.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile similarity index 96% rename from ci/docker/python-wheel-windows-test-vs2017.dockerfile rename to ci/docker/python-wheel-windows-test-vs2019.dockerfile index e842ede18454b..67d99fa9c5724 100644 --- a/ci/docker/python-wheel-windows-test-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. # based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" diff --git a/ci/docker/python-wheel-windows-vs2017.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile similarity index 98% rename from ci/docker/python-wheel-windows-vs2017.dockerfile rename to ci/docker/python-wheel-windows-vs2019.dockerfile index 067105b3a7995..b8e8aad952b1c 100644 --- a/ci/docker/python-wheel-windows-vs2017.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -19,8 +19,8 @@ # when you update this file. # based on mcr.microsoft.com/windows/servercore:ltsc2019 -# contains choco and vs2017 preinstalled -FROM abrarov/msvc-2017:2.11.0 +# contains choco and vs2019 preinstalled +FROM abrarov/msvc-2019:2.11.0 # Install CMake and Ninja ARG cmake=3.21.4 diff --git a/ci/scripts/install_minio.sh b/ci/scripts/install_minio.sh index 6ea8e1a095c39..e493a183b4543 100755 --- a/ci/scripts/install_minio.sh +++ b/ci/scripts/install_minio.sh @@ -17,7 +17,15 @@ # specific language governing permissions and limitations # under the License. -set -e +set -eu + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + exit 1 +fi + +version=$1 +prefix=$2 declare -A archs archs=([x86_64]=amd64 @@ -25,45 +33,60 @@ archs=([x86_64]=amd64 [aarch64]=arm64 [s390x]=s390x) -declare -A platforms -platforms=([Linux]=linux - [Darwin]=darwin) - arch=$(uname -m) -platform=$(uname) -version=$1 -prefix=$2 - -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - exit 1 -elif [ -z ${archs[$arch]} ]; then +if [ -z ${archs[$arch]} ]; then echo "Unsupported architecture: ${arch}" exit 0 -elif [ -z ${platforms[$platform]} ]; then - echo "Unsupported platform: ${platform}" - exit 0 -elif [ "${version}" != "latest" ]; then +fi +arch=${archs[$arch]} + +platform=$(uname) +case ${platform} in + Linux) + platform=linux + ;; + Darwin) + platform=darwin + ;; + MSYS_NT*|MINGW64_NT*) + platform=windows + ;; + *) + echo "Unsupported platform: ${platform}" + exit 0 + ;; +esac + +if [ "${version}" != "latest" ]; then echo "Cannot fetch specific versions of minio, only latest is supported." exit 1 fi -arch=${archs[$arch]} -platform=${platforms[$platform]} - # Use specific versions for minio server and client to avoid CI failures on new releases. minio_version="minio.RELEASE.2022-05-26T05-48-41Z" mc_version="mc.RELEASE.2022-05-09T04-08-26Z" +download() +{ + local output=$1 + local url=$2 + + if type wget > /dev/null 2>&1; then + wget -nv --output-document ${output} ${url} + else + curl --fail --location --output ${output} ${url} + fi +} + if [[ ! -x ${prefix}/bin/minio ]]; then url="https://dl.min.io/server/minio/release/${platform}-${arch}/archive/${minio_version}" echo "Fetching ${url}..." - wget -nv --output-document ${prefix}/bin/minio ${url} + download ${prefix}/bin/minio ${url} chmod +x ${prefix}/bin/minio fi if [[ ! -x ${prefix}/bin/mc ]]; then url="https://dl.min.io/client/mc/release/${platform}-${arch}/archive/${mc_version}" echo "Fetching ${url}..." - wget -nv --output-document ${prefix}/bin/mc ${url} + download ${prefix}/bin/mc ${url} chmod +x ${prefix}/bin/mc fi diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index ffb43b3481e55..73b0192d9bc97 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -19,7 +19,7 @@ echo "Building windows wheel..." -call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat" echo "=== (%PYTHON_VERSION%) Clear output directories and leftovers ===" del /s /q C:\arrow-build @@ -50,7 +50,8 @@ set ARROW_WITH_SNAPPY=ON set ARROW_WITH_ZLIB=ON set ARROW_WITH_ZSTD=ON set CMAKE_UNITY_BUILD=ON -set CMAKE_GENERATOR=Visual Studio 15 2017 Win64 +set CMAKE_GENERATOR=Visual Studio 16 2019 +set CMAKE_PLATFORM=x64 set VCPKG_ROOT=C:\vcpkg set VCPKG_FEATURE_FLAGS=-manifests set VCGPK_TARGET_TRIPLET=amd64-windows-static-md-%CMAKE_BUILD_TYPE% @@ -96,6 +97,7 @@ cmake ^ -DVCPKG_MANIFEST_MODE=OFF ^ -DVCPKG_TARGET_TRIPLET=%VCGPK_TARGET_TRIPLET% ^ -G "%CMAKE_GENERATOR%" ^ + -A "%CMAKE_PLATFORM%" ^ C:\arrow\cpp || exit /B 1 cmake --build . --config %CMAKE_BUILD_TYPE% --target install || exit /B 1 popd @@ -121,6 +123,6 @@ set CMAKE_PREFIX_PATH=C:\arrow-dist pushd C:\arrow\python @REM bundle the msvc runtime -cp "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Redist\MSVC\14.16.27012\x64\Microsoft.VC141.CRT\msvcp140.dll" pyarrow\ +cp "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Redist\MSVC\14.28.29325\x64\Microsoft.VC142.CRT\msvcp140.dll" pyarrow\ python setup.py bdist_wheel || exit /B 1 popd diff --git a/ci/vcpkg/ports.patch b/ci/vcpkg/ports.patch index 68f6cae5addc9..0d4fb540a2003 100644 --- a/ci/vcpkg/ports.patch +++ b/ci/vcpkg/ports.patch @@ -1,13 +1,14 @@ diff --git a/ports/curl/portfile.cmake b/ports/curl/portfile.cmake -index 5a14562..924b1b7 100644 +index bdc544e9e..53f6bbc3b 100644 --- a/ports/curl/portfile.cmake +++ b/ports/curl/portfile.cmake -@@ -87,8 +87,11 @@ vcpkg_cmake_configure( +@@ -74,9 +74,12 @@ vcpkg_cmake_configure( -DENABLE_MANUAL=OFF -DCURL_CA_FALLBACK=ON -DCURL_USE_LIBPSL=OFF + -DCURL_CA_PATH=none + -DCURL_CA_BUNDLE=none + -DCMAKE_DISABLE_FIND_PACKAGE_Perl=ON OPTIONS_DEBUG -DENABLE_DEBUG=ON + ${EXTRA_ARGS_DEBUG} @@ -15,29 +16,29 @@ index 5a14562..924b1b7 100644 vcpkg_cmake_install() vcpkg_copy_pdbs() diff --git a/ports/snappy/portfile.cmake b/ports/snappy/portfile.cmake -index 8f3f3f9..745b0fb 100644 +index 0c7098082..c603c3653 100644 --- a/ports/snappy/portfile.cmake +++ b/ports/snappy/portfile.cmake -@@ -9,6 +9,7 @@ vcpkg_from_github( - HEAD_REF master +@@ -10,6 +10,7 @@ vcpkg_from_github( PATCHES fix_clang-cl_build.patch + no-werror.patch + "snappy-disable-bmi.patch" ) vcpkg_cmake_configure( diff --git a/ports/snappy/snappy-disable-bmi.patch b/ports/snappy/snappy-disable-bmi.patch new file mode 100644 -index 0000000..a57ce0c +index 000000000..e839c93a4 --- /dev/null +++ b/ports/snappy/snappy-disable-bmi.patch @@ -0,0 +1,19 @@ +diff --git a/snappy.cc b/snappy.cc -+index 79dc0e8..f3153ea 100644 ++index d414718..7b49d2a 100644 +--- a/snappy.cc ++++ b/snappy.cc -+@@ -965,14 +965,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, -+ static inline uint32_t ExtractLowBytes(uint32_t v, int n) { ++@@ -1014,14 +1014,10 @@ static inline void Report(const char *algorithm, size_t compressed_size, ++ static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { + assert(n >= 0); + assert(n <= 4); +-#if SNAPPY_HAVE_BMI2 @@ -52,13 +53,13 @@ index 0000000..a57ce0c + + static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { diff --git a/ports/llvm/portfile.cmake b/ports/llvm/portfile.cmake -index 4d7e26a..1f054a2 100644 +index bf9397b66..c3112b673 100644 --- a/ports/llvm/portfile.cmake +++ b/ports/llvm/portfile.cmake -@@ -274,6 +274,8 @@ vcpkg_cmake_configure( +@@ -293,6 +293,8 @@ vcpkg_cmake_configure( + ${FEATURE_OPTIONS} + MAYBE_UNUSED_VARIABLES COMPILER_RT_ENABLE_IOS - OPENMP_TOOLS_INSTALL_DIR - MLIR_TOOLS_INSTALL_DIR + BOLT_TOOLS_INSTALL_DIR + LIBOMP_INSTALL_ALIASES ) diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 71c23165e61f0..99771728ecf18 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -81,8 +81,11 @@ "default-features": false, "features": [ "clang", - "default-options", "default-targets", + "enable-bindings", + "enable-terminfo", + "enable-zlib", + "enable-zstd", "enable-rtti", "lld", "tools" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 016cd8a1b9ec8..50a85b33d5489 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -152,6 +152,7 @@ set(ARROW_DOC_DIR "share/doc/${PROJECT_NAME}") set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(ARROW_LLVM_VERSIONS + "18.1" "17.0" "16.0" "15.0" diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 6bb9c0f6af2ca..b16ee07756013 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2594,16 +2594,11 @@ macro(build_re2) endmacro() if(ARROW_WITH_RE2) - # Don't specify "PC_PACKAGE_NAMES re2" here because re2.pc may - # include -std=c++11. It's not compatible with C source and C++ - # source not uses C++ 11. - resolve_dependency(re2 HAVE_ALT TRUE) - if(${re2_SOURCE} STREQUAL "SYSTEM" AND ARROW_BUILD_STATIC) - get_target_property(RE2_TYPE re2::re2 TYPE) - if(NOT RE2_TYPE STREQUAL "INTERFACE_LIBRARY") - string(APPEND ARROW_PC_LIBS_PRIVATE " $") - endif() - endif() + resolve_dependency(re2 + HAVE_ALT + TRUE + PC_PACKAGE_NAMES + re2) add_definitions(-DARROW_WITH_RE2) endif() @@ -2634,7 +2629,7 @@ macro(build_bzip2) BUILD_IN_SOURCE 1 BUILD_COMMAND ${MAKE} libbz2.a ${MAKE_BUILD_ARGS} ${BZIP2_EXTRA_ARGS} - INSTALL_COMMAND ${MAKE} install PREFIX=${BZIP2_PREFIX} + INSTALL_COMMAND ${MAKE} install -j1 PREFIX=${BZIP2_PREFIX} ${BZIP2_EXTRA_ARGS} INSTALL_DIR ${BZIP2_PREFIX} URL ${ARROW_BZIP2_SOURCE_URL} diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 172ed8962ce77..119249da99a6d 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -914,6 +914,8 @@ struct DecodedMetadata { std::shared_ptr metadata; std::string extension_name; std::string extension_serialized; + int extension_name_index = -1; // index of extension_name in metadata + int extension_serialized_index = -1; // index of extension_serialized in metadata }; Result DecodeMetadata(const char* metadata) { @@ -956,8 +958,10 @@ Result DecodeMetadata(const char* metadata) { RETURN_NOT_OK(read_string(&values[i])); if (keys[i] == kExtensionTypeKeyName) { decoded.extension_name = values[i]; + decoded.extension_name_index = i; } else if (keys[i] == kExtensionMetadataKeyName) { decoded.extension_serialized = values[i]; + decoded.extension_serialized_index = i; } } decoded.metadata = key_value_metadata(std::move(keys), std::move(values)); @@ -1046,6 +1050,8 @@ struct SchemaImporter { ARROW_ASSIGN_OR_RAISE( type_, registered_ext_type->Deserialize(std::move(type_), metadata_.extension_serialized)); + RETURN_NOT_OK(metadata_.metadata->DeleteMany( + {metadata_.extension_name_index, metadata_.extension_serialized_index})); } } @@ -1537,6 +1543,8 @@ struct ArrayImporter { if (recursion_level_ >= kMaxImportRecursionLevel) { return Status::Invalid("Recursion level in ArrowArray struct exceeded"); } + device_type_ = parent->device_type_; + memory_mgr_ = parent->memory_mgr_; // Child buffers will keep the entire parent import alive. // Perhaps we can move the child structs to an owned area // when the parent ImportedArrayData::Release() gets called, @@ -1851,10 +1859,25 @@ struct ArrayImporter { template Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, int64_t byte_width = 1) { - auto offsets = data_->GetValues(offsets_buffer_id); + if (device_type_ == DeviceAllocationType::kCPU) { + auto offsets = data_->GetValues(offsets_buffer_id); + // Compute visible size of buffer + int64_t buffer_size = + (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; + return ImportBuffer(buffer_id, buffer_size); + } + + // we only need the value of the last offset so let's just copy that + // one value from device to host. + auto single_value_buf = + SliceBuffer(data_->buffers[offsets_buffer_id], + c_struct_->length * sizeof(OffsetType), sizeof(OffsetType)); + ARROW_ASSIGN_OR_RAISE( + auto cpubuf, Buffer::ViewOrCopy(single_value_buf, default_cpu_memory_manager())); + auto offsets = cpubuf->data_as(); // Compute visible size of buffer - int64_t buffer_size = - (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0; + int64_t buffer_size = (c_struct_->length > 0) ? byte_width * offsets[0] : 0; + return ImportBuffer(buffer_id, buffer_size); } diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 321ec36c38d8c..b8d5e0fcd3845 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -1872,7 +1872,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertTypeEqual(*expected, *type); + AssertTypeEqual(*expected, *type, /*check_metadata=*/true); } void CheckImport(const std::shared_ptr& expected) { @@ -1892,7 +1892,7 @@ class TestSchemaImport : public ::testing::Test, public SchemaStructBuilder { ASSERT_TRUE(ArrowSchemaIsReleased(&c_struct_)); Reset(); // for further tests cb.AssertCalled(); // was released - AssertSchemaEqual(*expected, *schema); + AssertSchemaEqual(*expected, *schema, /*check_metadata=*/true); } void CheckImportError() { @@ -3571,7 +3571,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the type ASSERT_OK_AND_ASSIGN(actual, ImportType(&c_schema)); type = factory_expected(); - AssertTypeEqual(*type, *actual); + AssertTypeEqual(*type, *actual, /*check_metadata=*/true); type.reset(); actual.reset(); @@ -3602,7 +3602,7 @@ class TestSchemaRoundtrip : public ::testing::Test { // Recreate the schema ASSERT_OK_AND_ASSIGN(actual, ImportSchema(&c_schema)); schema = factory(); - AssertSchemaEqual(*schema, *actual); + AssertSchemaEqual(*schema, *actual, /*check_metadata=*/true); schema.reset(); actual.reset(); @@ -3695,13 +3695,27 @@ TEST_F(TestSchemaRoundtrip, Dictionary) { } } +// Given an extension type, return a field of its storage type + the +// serialized extension metadata. +std::shared_ptr GetStorageWithMetadata(const std::string& field_name, + const std::shared_ptr& type) { + const auto& ext_type = checked_cast(*type); + auto storage_type = ext_type.storage_type(); + auto md = KeyValueMetadata::Make({kExtensionTypeKeyName, kExtensionMetadataKeyName}, + {ext_type.extension_name(), ext_type.Serialize()}); + return field(field_name, storage_type, /*nullable=*/true, md); +} + TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); - // Inside nested type - TestWithTypeFactory([]() { return list(dict_extension_type()); }, - []() { return list(dictionary(int8(), utf8())); }); + // Inside nested type. + // When an extension type is not known by the importer, it is imported + // as its storage type and the extension metadata is preserved on the field. + TestWithTypeFactory( + []() { return list(dict_extension_type()); }, + []() { return list(GetStorageWithMetadata("item", dict_extension_type())); }); } TEST_F(TestSchemaRoundtrip, RegisteredExtension) { @@ -3710,7 +3724,9 @@ TEST_F(TestSchemaRoundtrip, RegisteredExtension) { TestWithTypeFactory(dict_extension_type); TestWithTypeFactory(complex128); - // Inside nested type + // Inside nested type. + // When the extension type is registered, the extension metadata is removed + // from the storage type's field to ensure roundtripping (GH-39865). TestWithTypeFactory([]() { return list(uuid()); }); TestWithTypeFactory([]() { return list(dict_extension_type()); }); TestWithTypeFactory([]() { return list(complex128()); }); @@ -3810,7 +3826,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -3850,7 +3866,7 @@ class TestArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4230,7 +4246,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory_expected())); - AssertTypeEqual(*expected->type(), *array->type()); + AssertTypeEqual(*expected->type(), *array->type(), /*check_metadata=*/true); AssertArraysEqual(*expected, *array, true); } array.reset(); @@ -4276,7 +4292,7 @@ class TestDeviceArrayRoundtrip : public ::testing::Test { { std::shared_ptr expected; ASSERT_OK_AND_ASSIGN(expected, ToResult(factory())); - AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertSchemaEqual(*expected->schema(), *batch->schema(), /*check_metadata=*/true); AssertBatchesEqual(*expected, *batch); } batch.reset(); @@ -4304,6 +4320,16 @@ TEST_F(TestDeviceArrayRoundtrip, Primitive) { TestWithJSON(mm, int32(), "[4, 5, null]"); } +TEST_F(TestDeviceArrayRoundtrip, Struct) { + std::shared_ptr device = std::make_shared(1); + auto mm = device->default_memory_manager(); + auto type = struct_({field("ints", int16()), field("strs", utf8())}); + + TestWithJSON(mm, type, "[]"); + TestWithJSON(mm, type, R"([[4, "foo"], [5, "bar"]])"); + TestWithJSON(mm, type, R"([[4, null], null, [5, "foo"]])"); +} + //////////////////////////////////////////////////////////////////////////// // Array stream export tests @@ -4353,7 +4379,7 @@ class TestArrayStreamExport : public BaseArrayStreamTest { SchemaExportGuard schema_guard(&c_schema); ASSERT_FALSE(ArrowSchemaIsReleased(&c_schema)); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(expected, *schema); + AssertSchemaEqual(expected, *schema, /*check_metadata=*/true); } void AssertStreamEnd(struct ArrowArrayStream* c_stream) { @@ -4437,7 +4463,7 @@ TEST_F(TestArrayStreamExport, ArrayLifetime) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto got_schema, ImportSchema(&c_schema)); - AssertSchemaEqual(*schema, *got_schema); + AssertSchemaEqual(*schema, *got_schema, /*check_metadata=*/true); } ASSERT_GT(pool_->bytes_allocated(), orig_allocated_); @@ -4462,7 +4488,7 @@ TEST_F(TestArrayStreamExport, Errors) { { SchemaExportGuard schema_guard(&c_schema); ASSERT_OK_AND_ASSIGN(auto schema, ImportSchema(&c_schema)); - AssertSchemaEqual(schema, arrow::schema({})); + AssertSchemaEqual(schema, arrow::schema({}), /*check_metadata=*/true); } struct ArrowArray c_array; @@ -4539,7 +4565,7 @@ TEST_F(TestArrayStreamRoundtrip, Simple) { ASSERT_OK_AND_ASSIGN(auto reader, RecordBatchReader::Make(batches, orig_schema)); Roundtrip(std::move(reader), [&](const std::shared_ptr& reader) { - AssertSchemaEqual(*orig_schema, *reader->schema()); + AssertSchemaEqual(*orig_schema, *reader->schema(), /*check_metadata=*/true); AssertReaderNext(reader, *batches[0]); AssertReaderNext(reader, *batches[1]); AssertReaderEnd(reader); diff --git a/cpp/src/arrow/compute/expression.cc b/cpp/src/arrow/compute/expression.cc index b47e0a35525c5..8c59ad1df86f2 100644 --- a/cpp/src/arrow/compute/expression.cc +++ b/cpp/src/arrow/compute/expression.cc @@ -761,6 +761,15 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i } } + int64_t input_length; + if (!arguments.empty() && all_scalar) { + // all inputs are scalar, so use a 1-long batch to avoid + // computing input.length equivalent outputs + input_length = 1; + } else { + input_length = input.length; + } + auto executor = compute::detail::KernelExecutor::MakeScalar(); compute::KernelContext kernel_context(exec_context, call->kernel); @@ -772,8 +781,8 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i RETURN_NOT_OK(executor->Init(&kernel_context, {kernel, types, options})); compute::detail::DatumAccumulator listener; - RETURN_NOT_OK(executor->Execute( - ExecBatch(std::move(arguments), all_scalar ? 1 : input.length), &listener)); + RETURN_NOT_OK( + executor->Execute(ExecBatch(std::move(arguments), input_length), &listener)); const auto out = executor->WrapResults(arguments, listener.values()); #ifndef NDEBUG DCHECK_OK(executor->CheckResultType(out, call->function_name.c_str())); diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index 44159e76600fb..d33c348cd77da 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -863,6 +863,25 @@ TEST(Expression, ExecuteCall) { ])")); } +TEST(Expression, ExecuteCallWithNoArguments) { + const int kCount = 10; + auto random_options = RandomOptions::FromSeed(/*seed=*/0); + ExecBatch input({}, kCount); + + Expression random_expr = call("random", {}, random_options); + ASSERT_OK_AND_ASSIGN(random_expr, random_expr.Bind(float64())); + + ASSERT_OK_AND_ASSIGN(Datum actual, ExecuteScalarExpression(random_expr, input)); + compute::ExecContext* exec_context = default_exec_context(); + ASSERT_OK_AND_ASSIGN(auto function, + exec_context->func_registry()->GetFunction("random")); + ASSERT_OK_AND_ASSIGN(Datum expected, + function->Execute(input, &random_options, exec_context)); + AssertDatumsEqual(actual, expected, /*verbose=*/true); + + EXPECT_EQ(actual.length(), kCount); +} + TEST(Expression, ExecuteDictionaryTransparent) { ExpectExecute( equal(field_ref("a"), field_ref("b")), diff --git a/cpp/src/arrow/csv/parser_test.cc b/cpp/src/arrow/csv/parser_test.cc index 960a69c59db5d..dd3d025202018 100644 --- a/cpp/src/arrow/csv/parser_test.cc +++ b/cpp/src/arrow/csv/parser_test.cc @@ -175,6 +175,13 @@ void AssertParsePartial(BlockParser& parser, const std::string& str, ASSERT_EQ(parsed_size, expected_size); } +void AssertParsePartial(BlockParser& parser, const std::vector& data, + uint32_t expected_size) { + uint32_t parsed_size = static_cast(-1); + ASSERT_OK(parser.Parse(data, &parsed_size)); + ASSERT_EQ(parsed_size, expected_size); +} + void AssertLastRowEq(const BlockParser& parser, const std::vector& expected) { std::vector values; @@ -376,6 +383,21 @@ TEST(BlockParser, TruncatedData) { } } +TEST(BlockParser, TruncatedDataViews) { + // The BlockParser API mandates that, when passing a vector of views, + // only the last view may be a truncated CSV block. + // In the current implementation, receiving a truncated non-last view + // simply stops parsing after that view. + BlockParser parser(ParseOptions::Defaults(), /*num_cols=*/3); + AssertParsePartial(parser, Views({"a,b,", "c\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\nd,", "e,f\n"}), 6); + + // More sophisticated: non-last block ends on some newline inside a quoted string + // (terse reproducer of gh-39857) + AssertParsePartial(parser, Views({"a,b,\"c\n", "\"\n"}), 0); + AssertParsePartial(parser, Views({"a,b,c\n\"d\n", "\",e,f\n"}), 6); +} + TEST(BlockParser, Final) { // Tests for ParseFinal() BlockParser parser(ParseOptions::Defaults()); diff --git a/cpp/src/arrow/csv/reader.cc b/cpp/src/arrow/csv/reader.cc index 332fad054fea3..1ac25e290a814 100644 --- a/cpp/src/arrow/csv/reader.cc +++ b/cpp/src/arrow/csv/reader.cc @@ -261,11 +261,10 @@ class SerialBlockReader : public BlockReader { auto consume_bytes = [this, bytes_before_buffer, next_buffer](int64_t nbytes) -> Status { DCHECK_GE(nbytes, 0); - auto offset = nbytes - bytes_before_buffer; - if (offset < 0) { - // Should not happen - return Status::Invalid("CSV parser got out of sync with chunker"); - } + int64_t offset = nbytes - bytes_before_buffer; + // All data before the buffer should have been consumed. + // This is checked in Parse() and BlockParsingOperator::operator(). + DCHECK_GE(offset, 0); partial_ = SliceBuffer(buffer_, offset); buffer_ = next_buffer; return Status::OK(); @@ -400,6 +399,7 @@ class BlockParsingOperator { count_rows_(first_row >= 0), num_rows_seen_(first_row) {} + // TODO: this is almost entirely the same as ReaderMixin::Parse(). Refactor? Result operator()(const CSVBlock& block) { constexpr int32_t max_num_rows = std::numeric_limits::max(); auto parser = std::make_shared( @@ -427,9 +427,24 @@ class BlockParsingOperator { } else { RETURN_NOT_OK(parser->Parse(views, &parsed_size)); } + + // `partial + completion` should have been entirely consumed. + const int64_t bytes_before_buffer = block.partial->size() + block.completion->size(); + if (static_cast(parsed_size) < bytes_before_buffer) { + // This can happen if `newlines_in_values` is not enabled and + // `partial + completion` ends with a newline inside a quoted string. + // In this case, the BlockParser stops at the truncated data in the first + // block (see gh-39857). + return Status::Invalid( + "CSV parser got out of sync with chunker. This can mean the data file " + "contains cell values spanning multiple lines; please consider enabling " + "the option 'newlines_in_values'."); + } + if (count_rows_) { num_rows_seen_ += parser->total_num_rows(); } + RETURN_NOT_OK(block.consume_bytes(parsed_size)); return ParsedBlock{std::move(parser), block.block_index, static_cast(parsed_size) + block.bytes_skipped}; @@ -738,6 +753,15 @@ class ReaderMixin { } else { RETURN_NOT_OK(parser->Parse(views, &parsed_size)); } + // See BlockParsingOperator for explanation. + const int64_t bytes_before_buffer = partial->size() + completion->size(); + if (static_cast(parsed_size) < bytes_before_buffer) { + return Status::Invalid( + "CSV parser got out of sync with chunker. This can mean the data file " + "contains cell values spanning multiple lines; please consider enabling " + "the option 'newlines_in_values'."); + } + if (count_rows_) { num_rows_seen_ += parser->total_num_rows(); } diff --git a/cpp/src/arrow/device.cc b/cpp/src/arrow/device.cc index 616f89aae896f..3736a4e018c33 100644 --- a/cpp/src/arrow/device.cc +++ b/cpp/src/arrow/device.cc @@ -195,6 +195,13 @@ Result> CPUMemoryManager::ViewBufferFrom( if (!from->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. + if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), shared_from_this(), buf); + } return buf; } @@ -220,6 +227,13 @@ Result> CPUMemoryManager::ViewBufferTo( if (!to->is_cpu()) { return nullptr; } + // in this case the memory manager we're coming from is visible on the CPU, + // but uses an allocation type other than CPU. Since we know the data is visible + // to the CPU a "View" of this should use the CPUMemoryManager as the listed memory + // manager. + if (buf->device_type() != DeviceAllocationType::kCPU) { + return std::make_shared(buf->address(), buf->size(), to, buf); + } return buf; } diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index bc48ae76c2a2f..002e8b0975094 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -90,7 +90,7 @@ void KeyValueMetadata::Append(std::string key, std::string value) { values_.push_back(std::move(value)); } -Result KeyValueMetadata::Get(const std::string& key) const { +Result KeyValueMetadata::Get(std::string_view key) const { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -129,7 +129,7 @@ Status KeyValueMetadata::DeleteMany(std::vector indices) { return Status::OK(); } -Status KeyValueMetadata::Delete(const std::string& key) { +Status KeyValueMetadata::Delete(std::string_view key) { auto index = FindKey(key); if (index < 0) { return Status::KeyError(key); @@ -138,20 +138,18 @@ Status KeyValueMetadata::Delete(const std::string& key) { } } -Status KeyValueMetadata::Set(const std::string& key, const std::string& value) { +Status KeyValueMetadata::Set(std::string key, std::string value) { auto index = FindKey(key); if (index < 0) { - Append(key, value); + Append(std::move(key), std::move(value)); } else { - keys_[index] = key; - values_[index] = value; + keys_[index] = std::move(key); + values_[index] = std::move(value); } return Status::OK(); } -bool KeyValueMetadata::Contains(const std::string& key) const { - return FindKey(key) >= 0; -} +bool KeyValueMetadata::Contains(std::string_view key) const { return FindKey(key) >= 0; } void KeyValueMetadata::reserve(int64_t n) { DCHECK_GE(n, 0); @@ -188,7 +186,7 @@ std::vector> KeyValueMetadata::sorted_pairs( return pairs; } -int KeyValueMetadata::FindKey(const std::string& key) const { +int KeyValueMetadata::FindKey(std::string_view key) const { for (size_t i = 0; i < keys_.size(); ++i) { if (keys_[i] == key) { return static_cast(i); diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index 8702ce73a639a..57ade11e75868 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,13 +45,13 @@ class ARROW_EXPORT KeyValueMetadata { void ToUnorderedMap(std::unordered_map* out) const; void Append(std::string key, std::string value); - Result Get(const std::string& key) const; - bool Contains(const std::string& key) const; + Result Get(std::string_view key) const; + bool Contains(std::string_view key) const; // Note that deleting may invalidate known indices - Status Delete(const std::string& key); + Status Delete(std::string_view key); Status Delete(int64_t index); Status DeleteMany(std::vector indices); - Status Set(const std::string& key, const std::string& value); + Status Set(std::string key, std::string value); void reserve(int64_t n); @@ -63,7 +64,7 @@ class ARROW_EXPORT KeyValueMetadata { std::vector> sorted_pairs() const; /// \brief Perform linear search for key, returning -1 if not found - int FindKey(const std::string& key) const; + int FindKey(std::string_view key) const; std::shared_ptr Copy() const; diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 3f038f54a7b27..d773fb5ff5895 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -229,6 +229,15 @@ function(ADD_GANDIVA_TEST REL_TEST_NAME) set(TEST_NAME gandiva-${REL_TEST_NAME}) string(REPLACE "_" "-" TEST_NAME ${TEST_NAME}) + + if(ARG_USE_STATIC_LINKING OR ARROW_TEST_LINKAGE STREQUAL "static") + # LLVM 17 or later requires that an executable exports + # "llvm_orc_registerEHFrameSectionWrapper()" and + # "llvm_orc_unregisterEHFrameSectionWrapper()". We need to do + # nothing when we use libLLVM.so. But we need to export symbols + # explicitly when we use libLLVM*.a. + set_target_properties(${TEST_NAME} PROPERTIES ENABLE_EXPORTS TRUE) + endif() endfunction() add_gandiva_test(internals-test diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index fc047f2ac0763..bfce72cefc630 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -62,7 +62,11 @@ #endif #include #include +#if LLVM_VERSION_MAJOR >= 18 +#include +#else #include +#endif #include #include #if LLVM_VERSION_MAJOR >= 14 @@ -86,7 +90,9 @@ #include #include #include +#if LLVM_VERSION_MAJOR <= 17 #include +#endif // JITLink is available in LLVM 9+ // but the `InProcessMemoryManager::Create` API was added since LLVM 14 @@ -132,8 +138,13 @@ Result MakeTargetMachineBuilder( jtmb.setCPU(cpu_name.str()); jtmb.addFeatures(cpu_attrs); } +#if LLVM_VERSION_MAJOR >= 18 + using CodeGenOptLevel = llvm::CodeGenOptLevel; +#else + using CodeGenOptLevel = llvm::CodeGenOpt::Level; +#endif auto const opt_level = - conf.optimize() ? llvm::CodeGenOpt::Aggressive : llvm::CodeGenOpt::None; + conf.optimize() ? CodeGenOptLevel::Aggressive : CodeGenOptLevel::None; jtmb.setCodeGenOptLevel(opt_level); return jtmb; } diff --git a/cpp/src/parquet/arrow/schema_internal.h b/cpp/src/parquet/arrow/schema_internal.h index 55292ac35ab9c..f56ba0958ae2d 100644 --- a/cpp/src/parquet/arrow/schema_internal.h +++ b/cpp/src/parquet/arrow/schema_internal.h @@ -34,10 +34,6 @@ Result> FromFLBA(const LogicalType& logical_t Result> FromInt32(const LogicalType& logical_type); Result> FromInt64(const LogicalType& logical_type); -Result> GetArrowType(Type::type physical_type, - const LogicalType& logical_type, - int type_length); - Result> GetArrowType( Type::type physical_type, const LogicalType& logical_type, int type_length, ::arrow::TimeUnit::type int96_arrow_time_unit = ::arrow::TimeUnit::NANO); diff --git a/cpp/src/parquet/column_reader_benchmark.cc b/cpp/src/parquet/column_reader_benchmark.cc index 49b2317ede187..61fe397cf1c30 100644 --- a/cpp/src/parquet/column_reader_benchmark.cc +++ b/cpp/src/parquet/column_reader_benchmark.cc @@ -219,5 +219,103 @@ BENCHMARK(RecordReaderReadRecords) ->Args({2, 1000, true}) ->Args({2, 1000, false}); +void GenerateLevels(int level_repeats, int max_level, int num_levels, + std::vector* levels) { + // Generate random levels + std::default_random_engine gen(/*seed=*/1943); + std::uniform_int_distribution d(0, max_level); + for (int i = 0; i < num_levels;) { + int16_t current_level = d(gen); // level repeat `level_repeats` times + const int current_repeated = std::min(level_repeats, num_levels - i); + levels->insert(levels->end(), current_repeated, current_level); + i += current_repeated; + } +} + +void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, + const int16_t* input_levels, std::vector* bytes) { + LevelEncoder encoder; + // encode levels + if (encoding == Encoding::RLE) { + int rle_size = LevelEncoder::MaxBufferSize(encoding, max_level, num_levels); + bytes->resize(rle_size + sizeof(int32_t)); + // leave space to write the rle length value + encoder.Init(encoding, max_level, num_levels, bytes->data() + sizeof(int32_t), + rle_size); + encoder.Encode(num_levels, input_levels); + int data_length = encoder.len(); + memcpy(bytes->data(), &data_length, sizeof(int32_t)); + } else { + int bitpack_size = + LevelEncoder::MaxBufferSize(encoding, max_level, num_levels) + sizeof(int32_t); + bytes->resize(bitpack_size); + encoder.Init(encoding, max_level, num_levels, bytes->data(), + static_cast(bytes->size())); + encoder.Encode(num_levels, input_levels); + } +} + +static void DecodeLevels(Encoding::type level_encoding, int16_t max_level, int num_levels, + int batch_size, int level_repeat_count, + ::benchmark::State& state) { + std::vector bytes; + { + std::vector input_levels; + GenerateLevels(/*level_repeats=*/level_repeat_count, /*max_repeat_factor=*/max_level, + num_levels, &input_levels); + EncodeLevels(level_encoding, max_level, num_levels, input_levels.data(), &bytes); + } + + LevelDecoder decoder; + std::vector output_levels(batch_size); + for (auto _ : state) { + state.PauseTiming(); + decoder.SetData(level_encoding, max_level, num_levels, bytes.data(), + static_cast(bytes.size())); + state.ResumeTiming(); + // Decode multiple times with batch_size + while (true) { + int levels_decoded = decoder.Decode(batch_size, output_levels.data()); + if (levels_decoded == 0) { + break; + } + } + } + state.SetBytesProcessed(state.iterations() * num_levels * sizeof(int16_t)); + state.SetItemsProcessed(state.iterations() * num_levels); +} + +static void ReadLevels_Rle(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::RLE, max_level, num_levels, batch_size, level_repeat_count, + state); +} + +static void ReadLevels_BitPack(::benchmark::State& state) { + int16_t max_level = static_cast(state.range(0)); + int num_levels = static_cast(state.range(1)); + int batch_size = static_cast(state.range(2)); + int level_repeat_count = static_cast(state.range(3)); + DecodeLevels(Encoding::BIT_PACKED, max_level, num_levels, batch_size, + level_repeat_count, state); +} + +static void ReadLevelsArguments(::benchmark::internal::Benchmark* b) { + b->ArgNames({"MaxLevel", "NumLevels", "BatchSize", "LevelRepeatCount"}) + ->Args({1, 8096, 1024, 1}) + ->Args({1, 8096, 1024, 7}) + ->Args({1, 8096, 1024, 1024}) + ->Args({1, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 1}) + ->Args({3, 8096, 2048, 1}) + ->Args({3, 8096, 1024, 7}); +} + +BENCHMARK(ReadLevels_Rle)->Apply(ReadLevelsArguments); +BENCHMARK(ReadLevels_BitPack)->Apply(ReadLevelsArguments); + } // namespace benchmark } // namespace parquet diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 97421629d2ca6..a40e71ce30aec 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1021,7 +1021,7 @@ void EncodeLevels(Encoding::type encoding, int16_t max_level, int num_levels, } void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector& bytes) { LevelDecoder decoder; int levels_count = 0; @@ -1060,7 +1060,7 @@ void VerifyDecodingLevels(Encoding::type encoding, int16_t max_level, } void VerifyDecodingMultipleSetData(Encoding::type encoding, int16_t max_level, - std::vector& input_levels, + const std::vector& input_levels, std::vector>& bytes) { LevelDecoder decoder; int levels_count = 0; diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 68c3e47e01902..3a6ae28b390d2 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index f18f7d10c73e6..4df574700e812 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -86,6 +86,21 @@ if [ "$is_major_release" = "yes" ] ; then fi git add docs git commit -m "[Website] Update documentations for ${version}" + +# Update DOCUMENTATION_OPTIONS.theme_switcher_version_match and +# DOCUMENTATION_OPTIONS.show_version_warning_banner +pushd docs/${previous_series} +find ./ \ + -type f \ + -exec \ + sed -i.bak \ + -e "s/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '';/DOCUMENTATION_OPTIONS.theme_switcher_version_match = '${previous_version}';/g" \ + -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/g" \ + {} \; +find ./ -name '*.bak' -delete +popd +git add docs/${previous_series} +git commit -m "[Website] Update warning banner for ${previous_series}" git clean -d -f -x popd diff --git a/dev/tasks/python-wheels/github.windows.yml b/dev/tasks/python-wheels/github.windows.yml index 1641796a719e2..01f4977a9b0b1 100644 --- a/dev/tasks/python-wheels/github.windows.yml +++ b/dev/tasks/python-wheels/github.windows.yml @@ -29,7 +29,7 @@ jobs: # this is a private repository at the moment (mostly because of licensing # consideration of windows images with visual studio), but anyone can # recreate the image by manually building it via: - # `archery build python-wheel-windows-vs2017` + # `archery build python-wheel-windows-vs2019` # note that we don't run docker build since there wouldn't be a cache hit # and rebuilding the dependencies takes a fair amount of time REPO: ghcr.io/ursacomputing/arrow @@ -46,17 +46,17 @@ jobs: run: | cd arrow @rem We want to use only - @rem archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + @rem archery docker run -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 @rem but it doesn't use pulled caches. @rem It always build an image from scratch. @rem We can remove this workaround once we find a way to use @rem pulled caches when build an image. echo on - archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2017 + archery docker pull --no-ignore-pull-failures python-wheel-windows-vs2019 if errorlevel 1 ( - archery docker build --no-pull python-wheel-windows-vs2017 || exit /B 1 + archery docker build --no-pull python-wheel-windows-vs2019 || exit /B 1 ) - archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2017 + archery docker run --no-build -e SETUPTOOLS_SCM_PRETEND_VERSION={{ arrow.no_rc_version }} python-wheel-windows-vs2019 - uses: actions/upload-artifact@v3 with: @@ -77,5 +77,5 @@ jobs: shell: cmd run: | cd arrow - archery docker push python-wheel-windows-vs2017 + archery docker push python-wheel-windows-vs2019 {% endif %} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 0f8c58391fa66..cf04d29715306 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -748,6 +748,10 @@ tasks: - arrow-jdbc-{no_rc_snapshot_version}-tests.jar - arrow-jdbc-{no_rc_snapshot_version}.jar - arrow-jdbc-{no_rc_snapshot_version}.pom + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.json + - arrow-maven-plugins-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-maven-plugins-{no_rc_snapshot_version}-src.zip + - arrow-maven-plugins-{no_rc_snapshot_version}.pom - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-core-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-core-{no_rc_snapshot_version}-javadoc.jar @@ -762,6 +766,13 @@ tasks: - arrow-memory-netty-{no_rc_snapshot_version}-tests.jar - arrow-memory-netty-{no_rc_snapshot_version}.jar - arrow-memory-netty-{no_rc_snapshot_version}.pom + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.json + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-cyclonedx.xml + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-javadoc.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-sources.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}-tests.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.jar + - arrow-memory-netty-buffer-patch-{no_rc_snapshot_version}.pom - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.json - arrow-memory-unsafe-{no_rc_snapshot_version}-cyclonedx.xml - arrow-memory-unsafe-{no_rc_snapshot_version}-javadoc.jar @@ -839,6 +850,13 @@ tasks: - flight-sql-jdbc-driver-{no_rc_snapshot_version}-tests.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.jar - flight-sql-jdbc-driver-{no_rc_snapshot_version}.pom + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.json + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-cyclonedx.xml + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-javadoc.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-sources.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}-src.zip + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}.jar + - module-info-compiler-maven-plugin-{no_rc_snapshot_version}.pom ############################## NuGet packages ############################### diff --git a/docker-compose.yml b/docker-compose.yml index 0252c4ec8a896..8a7223b57632f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -172,7 +172,7 @@ x-hierarchy: - python-wheel-manylinux-2-28 - python-wheel-manylinux-test-imports - python-wheel-manylinux-test-unittests - - python-wheel-windows-vs2017 + - python-wheel-windows-vs2019 - python-wheel-windows-test volumes: @@ -1030,7 +1030,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux2014_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2014 @@ -1053,7 +1053,7 @@ services: args: arch: ${ARCH} arch_short: ${ARCH_SHORT} - base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2023-10-03-72cdc42 + base: quay.io/pypa/manylinux_2_28_${ARCH_ALIAS}:2024-02-04-ea37246 vcpkg: ${VCPKG} python: ${PYTHON} manylinux: 2_28 @@ -1098,19 +1098,19 @@ services: CHECK_UNITTESTS: "ON" command: /arrow/ci/scripts/python_wheel_unix_test.sh /arrow - python-wheel-windows-vs2017: - image: ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + python-wheel-windows-vs2019: + image: ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: vcpkg: ${VCPKG} python: ${PYTHON} context: . - dockerfile: ci/docker/python-wheel-windows-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-vs2019.dockerfile # This should make the pushed images reusable, but the image gets rebuilt. # Uncomment if no local cache is available. # cache_from: - # - abrarov/msvc-2017:2.11.0 - # - ${REPO}:python-${PYTHON}-wheel-windows-vs2017-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + # - abrarov/msvc-2019:2.11.0 + # - ${REPO}:python-${PYTHON}-wheel-windows-vs2019-vcpkg-${VCPKG}-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind @@ -1119,12 +1119,12 @@ services: command: arrow\\ci\\scripts\\python_wheel_windows_build.bat python-wheel-windows-test: - image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2017-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} + image: ${REPO}:python-${PYTHON}-wheel-windows-test-vs2019-${PYTHON_WHEEL_WINDOWS_IMAGE_REVISION} build: args: python: ${PYTHON} context: . - dockerfile: ci/docker/python-wheel-windows-test-vs2017.dockerfile + dockerfile: ci/docker/python-wheel-windows-test-vs2019.dockerfile volumes: - "${DOCKER_VOLUME_PREFIX}python-wheel-windows-clcache:C:/clcache" - type: bind diff --git a/go/parquet/file/column_writer.go b/go/parquet/file/column_writer.go index ac857d17e632d..4d603c547ca6a 100755 --- a/go/parquet/file/column_writer.go +++ b/go/parquet/file/column_writer.go @@ -397,7 +397,6 @@ func (w *columnWriter) FlushBufferedDataPages() (err error) { } } w.pages = w.pages[:0] - w.totalCompressedBytes = 0 return } @@ -542,7 +541,9 @@ func (w *columnWriter) Close() (err error) { if !w.closed { w.closed = true if w.hasDict && !w.fallbackToNonDict { - w.WriteDictionaryPage() + if err = w.WriteDictionaryPage(); err != nil { + return err + } } if err = w.FlushBufferedDataPages(); err != nil { @@ -659,7 +660,10 @@ func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int if values.Data().Offset() > 0 { data := values.Data() - buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes]) + elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes() + start := data.Offset() * elemSize + end := start + data.Len()*elemSize + buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end]) } data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0) diff --git a/go/parquet/file/column_writer_test.go b/go/parquet/file/column_writer_test.go index 8011ac2487995..dd597e280b850 100755 --- a/go/parquet/file/column_writer_test.go +++ b/go/parquet/file/column_writer_test.go @@ -24,6 +24,8 @@ import ( "sync" "testing" + "github.com/apache/arrow/go/v16/arrow" + "github.com/apache/arrow/go/v16/arrow/array" "github.com/apache/arrow/go/v16/arrow/bitutil" "github.com/apache/arrow/go/v16/arrow/memory" arrutils "github.com/apache/arrow/go/v16/internal/utils" @@ -36,6 +38,7 @@ import ( "github.com/apache/arrow/go/v16/parquet/internal/testutils" "github.com/apache/arrow/go/v16/parquet/internal/utils" "github.com/apache/arrow/go/v16/parquet/metadata" + "github.com/apache/arrow/go/v16/parquet/pqarrow" "github.com/apache/arrow/go/v16/parquet/schema" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -426,6 +429,26 @@ func (p *PrimitiveWriterTestSuite) testDictionaryFallbackEncoding(version parque } } +func (p *PrimitiveWriterTestSuite) testDictionaryFallbackAndCompressedSize(version parquet.Version) { + p.GenerateData(SmallSize) + props := parquet.DefaultColumnProperties() + props.DictionaryEnabled = true + + if version == parquet.V1_0 { + props.Encoding = parquet.Encodings.PlainDict + } else { + props.Encoding = parquet.Encodings.RLEDict + } + + writer := p.buildWriter(SmallSize, props, parquet.WithVersion(version)) + p.WriteBatchValues(writer, nil, nil) + writer.FallbackToPlain() + p.NotEqual(0, writer.TotalCompressedBytes()) + writer.Close() + p.NotEqual(0, writer.TotalCompressedBytes()) + p.NotEqual(0, writer.TotalBytesWritten()) +} + func (p *PrimitiveWriterTestSuite) TestRequiredPlain() { p.testRequiredWithEncoding(parquet.Encodings.Plain) } @@ -575,6 +598,14 @@ func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackEncodingV2() { p.testDictionaryFallbackEncoding(parquet.V2_LATEST) } +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV1() { + p.testDictionaryFallbackAndCompressedSize(parquet.V1_0) +} + +func (p *PrimitiveWriterTestSuite) TestDictionaryFallbackStatsV2() { + p.testDictionaryFallbackAndCompressedSize(parquet.V2_LATEST) +} + func (p *PrimitiveWriterTestSuite) TestOptionalNullValueChunk() { // test case for NULL values p.SetupSchema(parquet.Repetitions.Optional, 1) @@ -708,3 +739,38 @@ func (b *BooleanValueWriterSuite) TestAlternateBooleanValues() { b.Equal(i%2 == 0, b.ValuesOut.([]bool)[i]) } } + +func TestDictionaryReslice(t *testing.T) { + pts := []arrow.DataType{ + arrow.PrimitiveTypes.Int8, + arrow.PrimitiveTypes.Int16, + arrow.PrimitiveTypes.Int32, + arrow.PrimitiveTypes.Int64, + arrow.PrimitiveTypes.Uint8, + arrow.PrimitiveTypes.Uint16, + arrow.PrimitiveTypes.Uint32, + arrow.PrimitiveTypes.Uint64, + } + for _, pt := range pts { + t.Run(pt.String(), func(t *testing.T) { + mem := memory.NewGoAllocator() + dt := &arrow.DictionaryType{ + IndexType: pt, + ValueType: &arrow.StringType{}, + } + field := arrow.Field{Name: "test_field", Type: dt, Nullable: true} + schema := arrow.NewSchema([]arrow.Field{field}, nil) + b := array.NewRecordBuilder(mem, schema) + for i := 0; i < 2000; i++ { + b.Field(0).(*array.BinaryDictionaryBuilder).AppendString("test_value") + } + rec := b.NewRecord() + out := &bytes.Buffer{} + pqw, err := pqarrow.NewFileWriter(rec.Schema(), out, nil, pqarrow.NewArrowWriterProperties()) + assert.NoError(t, err) + err = pqw.WriteBuffered(rec) + assert.NoError(t, err) + + }) + } +} diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java index 5b01077b17996..f95133fc7e44c 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/Constants.java @@ -21,7 +21,8 @@ * String constants used for metadata returned on Vectors. */ public class Constants { - private Constants() {} + private Constants() { + } public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME"; public static final String SQL_SCHEMA_NAME_KEY = "SQL_SCHEMA_NAME"; diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java index 438a949b736f1..4478cdfbee6f7 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/MockPreparedStatement.java @@ -231,7 +231,8 @@ public void setDate(int parameterIndex, Date x, Calendar cal) throws SQLExceptio } @Override - public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException {} + public void setTime(int parameterIndex, Time x, Calendar cal) throws SQLException { + } @Override public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws SQLException { @@ -241,7 +242,8 @@ public void setTimestamp(int parameterIndex, Timestamp x, Calendar cal) throws S } @Override - public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException {} + public void setNull(int parameterIndex, int sqlType, String typeName) throws SQLException { + } @Override public void setURL(int parameterIndex, URL x) throws SQLException { @@ -259,62 +261,80 @@ public void setRowId(int parameterIndex, RowId x) throws SQLException { } @Override - public void setNString(int parameterIndex, String value) throws SQLException {} + public void setNString(int parameterIndex, String value) throws SQLException { + } @Override public void setNCharacterStream(int parameterIndex, Reader value, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setNClob(int parameterIndex, NClob value) throws SQLException {} + public void setNClob(int parameterIndex, NClob value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override public void setBlob(int parameterIndex, InputStream inputStream, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader, long length) throws SQLException { + } @Override - public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException {} + public void setSQLXML(int parameterIndex, SQLXML xmlObject) throws SQLException { + } @Override public void setObject(int parameterIndex, Object x, int targetSqlType, int scaleOrLength) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x, long length) throws SQLException { + } @Override public void setCharacterStream(int parameterIndex, Reader reader, long length) - throws SQLException {} + throws SQLException { + } @Override - public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException {} + public void setAsciiStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException {} + public void setBinaryStream(int parameterIndex, InputStream x) throws SQLException { + } @Override - public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException {} + public void setCharacterStream(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException {} + public void setNCharacterStream(int parameterIndex, Reader value) throws SQLException { + } @Override - public void setClob(int parameterIndex, Reader reader) throws SQLException {} + public void setClob(int parameterIndex, Reader reader) throws SQLException { + } @Override - public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException {} + public void setBlob(int parameterIndex, InputStream inputStream) throws SQLException { + } @Override - public void setNClob(int parameterIndex, Reader reader) throws SQLException {} + public void setNClob(int parameterIndex, Reader reader) throws SQLException { + } @Override public ResultSet executeQuery(String sql) throws SQLException { @@ -327,7 +347,8 @@ public int executeUpdate(String sql) throws SQLException { } @Override - public void close() throws SQLException {} + public void close() throws SQLException { + } @Override public int getMaxFieldSize() throws SQLException { diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java index c712741b51f5b..ccc7681c5bc8b 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/ResultSetUtility.java @@ -348,7 +348,8 @@ public static class MockColumnMetaData { private int displaySize; - private MockColumnMetaData() {} + private MockColumnMetaData() { + } private String getLabel() { return label; diff --git a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java index 9b599234bdf51..d61799e990f77 100644 --- a/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java +++ b/java/adapter/orc/src/main/java/org/apache/arrow/adapter/orc/OrcJniUtils.java @@ -32,7 +32,8 @@ class OrcJniUtils { private static final String LIBRARY_NAME = "arrow_orc_jni"; private static boolean isLoaded = false; - private OrcJniUtils() {} + private OrcJniUtils() { + } static void loadOrcAdapterLibraryFromJar() throws IOException, IllegalAccessException { diff --git a/java/dev/checkstyle/checkstyle.xml b/java/dev/checkstyle/checkstyle.xml index c27f382ddda76..b63a4a9cba1f3 100644 --- a/java/dev/checkstyle/checkstyle.xml +++ b/java/dev/checkstyle/checkstyle.xml @@ -60,6 +60,11 @@ + + + + + @@ -72,10 +77,6 @@ - - - - @@ -223,13 +224,12 @@ - - - - - + + + + diff --git a/java/dev/checkstyle/suppressions.xml b/java/dev/checkstyle/suppressions.xml index 585985bf32dbc..a3536e2ca9212 100644 --- a/java/dev/checkstyle/suppressions.xml +++ b/java/dev/checkstyle/suppressions.xml @@ -40,5 +40,5 @@ - + diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java index fc491ebe0df98..8f251a7c7ef07 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightClient.java @@ -437,7 +437,8 @@ public ClientStreamListener getWriter() { */ public void getResult() { // After exchange is complete, make sure stream is drained to propagate errors through reader - while (reader.next()) { }; + while (reader.next()) { + } } /** Shut down the streams in this call. */ diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java index eb5e492b4cd46..b711d7ef6b5d7 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightGrpcUtils.java @@ -125,7 +125,8 @@ public void enterIdle() { } } - private FlightGrpcUtils() {} + private FlightGrpcUtils() { + } /** * Creates a Flight service. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java index 7a5a941603ace..84beee7d40564 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStream.java @@ -194,7 +194,8 @@ public void close() throws Exception { } } // Drain the stream without the lock (as next() implicitly needs the lock) - while (next()) { } + while (next()) { + } } catch (FlightRuntimeException e) { suppressor = e; } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java index e80fb41c67273..80ddad90a1d28 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/OutboundStreamListener.java @@ -119,5 +119,6 @@ default void start(VectorSchemaRoot root, DictionaryProvider dictionaries) { *

The default value can be toggled globally by setting the JVM property arrow.flight.enable_zero_copy_write * or the environment variable ARROW_FLIGHT_ENABLE_ZERO_COPY_WRITE. */ - default void setUseZeroCopy(boolean enabled) {} + default void setUseZeroCopy(boolean enabled) { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java index e3ccdc626d71b..8a37115f1f024 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/AuthConstants.java @@ -47,5 +47,6 @@ public byte[] parseBytes(byte[] serialized) { public static final Context.Key PEER_IDENTITY_KEY = Context.keyWithDefault("arrow-flight-peer-identity", ""); - private AuthConstants() {} + private AuthConstants() { + } } diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java index ad1a36a935fd7..3647e113cc0f6 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/auth/ServerAuthWrapper.java @@ -115,7 +115,9 @@ public boolean hasNext() { @Override public void onError(Throwable t) { completed = true; - while (future == null) {/* busy wait */} + while (future == null) { + /* busy wait */ + } future.cancel(true); } diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java index bcff54bd7f66f..a1fa1f1d18509 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/TestClientMiddleware.java @@ -303,10 +303,12 @@ public void onBeforeSendingHeaders(CallHeaders outgoingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } @Override - public void onCallErrored(Throwable err) {} + public void onCallErrored(Throwable err) { + } } static class MultiHeaderClientMiddlewareFactory implements FlightClientMiddleware.Factory { @@ -356,6 +358,7 @@ public void onHeadersReceived(CallHeaders incomingHeaders) { } @Override - public void onCallCompleted(CallStatus status) {} + public void onCallCompleted(CallStatus status) { + } } } diff --git a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java index b8aa46fb5674a..13238f318eaaa 100644 --- a/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java +++ b/java/flight/flight-integration-tests/src/main/java/org/apache/arrow/flight/integration/tests/OrderedScenario.java @@ -55,7 +55,8 @@ public FlightProducer producer(BufferAllocator allocator, Location location) thr } @Override - public void buildServer(FlightServer.Builder builder) throws Exception {} + public void buildServer(FlightServer.Builder builder) throws Exception { + } @Override public void client(BufferAllocator allocator, Location location, FlightClient client) diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java index fdf6c508d93b0..de6dccad4a846 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/IntervalStringUtils.java @@ -31,7 +31,8 @@ public final class IntervalStringUtils { /** * Constructor Method of class. */ - private IntervalStringUtils( ) {} + private IntervalStringUtils( ) { + } /** * Formats a period similar to Oracle INTERVAL YEAR TO MONTH data type
. diff --git a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java index b7977462e9c01..78d252f7824c3 100644 --- a/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java +++ b/java/flight/flight-sql-jdbc-core/src/test/java/org/apache/arrow/driver/jdbc/client/utils/ClientAuthenticationUtilsTest.java @@ -84,7 +84,7 @@ public void testGetDefaultKeyStoreInstancePassword() throws IOException, keyStoreMockedStatic .when(() -> ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit")) - .thenReturn(keyStoreMock); + .thenReturn(keyStoreMock); KeyStore receiveKeyStore = ClientAuthenticationUtils.getDefaultKeyStoreInstance("changeit"); Assert.assertEquals(receiveKeyStore, keyStoreMock); } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java index e903b4e873278..fa5d285b90997 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ConfigurationBuilder.java @@ -43,7 +43,8 @@ public static ConfigOptions getDefault() { return new ConfigOptions(); } - public ConfigOptions() {} + public ConfigOptions() { + } public ConfigOptions withOptimize(boolean optimize) { this.optimize = optimize; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java index e0c072cfbe52e..703cfaa8be88b 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/DecimalTypeUtil.java @@ -23,7 +23,8 @@ * Utility methods for working with {@link Decimal} values. */ public class DecimalTypeUtil { - private DecimalTypeUtil() {} + private DecimalTypeUtil() { + } /** * Enum for supported mathematical operations. diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 90f8684b455a8..e7377cc5c9db4 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -33,7 +33,8 @@ * Utility methods to convert between Arrow and Gandiva types. */ public class ArrowTypeHelper { - private ArrowTypeHelper() {} + private ArrowTypeHelper() { + } static final int WIDTH_8 = 8; static final int WIDTH_16 = 16; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java index 8656e886aae24..3d2ea27d044e7 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/TreeBuilder.java @@ -29,7 +29,8 @@ * Contains helper functions for constructing expression trees. */ public class TreeBuilder { - private TreeBuilder() {} + private TreeBuilder() { + } /** * Helper functions to create literal constants. diff --git a/java/maven/pom.xml b/java/maven/pom.xml index 3a88ec762e19c..c2b13119fc440 100644 --- a/java/maven/pom.xml +++ b/java/maven/pom.xml @@ -235,7 +235,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j @@ -271,7 +271,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package @@ -333,7 +333,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java index ff2b25dfa30ab..b8de6d819eaf8 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/AllocationListener.java @@ -34,7 +34,8 @@ public interface AllocationListener { * * @param size the buffer size being allocated */ - default void onPreAllocation(long size) {} + default void onPreAllocation(long size) { + } /** * Called each time a new buffer has been allocated. @@ -43,7 +44,8 @@ default void onPreAllocation(long size) {} * * @param size the buffer size being allocated */ - default void onAllocation(long size) {} + default void onAllocation(long size) { + } /** * Informed each time a buffer is released from allocation. @@ -51,7 +53,8 @@ default void onAllocation(long size) {} *

An exception cannot be thrown by this method. * @param size The size of the buffer being released. */ - default void onRelease(long size) {} + default void onRelease(long size) { + } /** @@ -73,7 +76,8 @@ default boolean onFailedAllocation(long size, AllocationOutcome outcome) { * @param parentAllocator The parent allocator to which a child was added * @param childAllocator The child allocator that was just added */ - default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } /** * Called immediately after a child allocator was removed from the parent allocator. @@ -81,5 +85,6 @@ default void onChildAdded(BufferAllocator parentAllocator, BufferAllocator child * @param parentAllocator The parent allocator from which a child was removed * @param childAllocator The child allocator that was just removed */ - default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) {} + default void onChildRemoved(BufferAllocator parentAllocator, BufferAllocator childAllocator) { + } } diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 8779c7a3434ea..189c800ba0fe5 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -702,18 +702,18 @@ private void verifyAllocator( void print(StringBuilder sb, int level, Verbosity verbosity) { CommonUtil.indent(sb, level) - .append("Allocator(") - .append(name) - .append(") ") - .append(reservation) - .append('/') - .append(getAllocatedMemory()) - .append('/') - .append(getPeakMemoryAllocation()) - .append('/') - .append(getLimit()) - .append(" (res/actual/peak/limit)") - .append('\n'); + .append("Allocator(") + .append(name) + .append(") ") + .append(reservation) + .append('/') + .append(getAllocatedMemory()) + .append('/') + .append(getPeakMemoryAllocation()) + .append('/') + .append(getLimit()) + .append(" (res/actual/peak/limit)") + .append('\n'); if (DEBUG) { CommonUtil.indent(sb, level + 1).append(String.format("child allocators: %d\n", childAllocators.size())); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java index 1ca3e08ecf046..62d268a1f4493 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/BufferLedger.java @@ -478,20 +478,20 @@ public long getAccountedSize() { */ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { CommonUtil.indent(sb, indent) - .append("ledger[") - .append(ledgerId) - .append("] allocator: ") - .append(allocator.getName()) - .append("), isOwning: ") - .append(", size: ") - .append(", references: ") - .append(bufRefCnt.get()) - .append(", life: ") - .append(lCreationTime) - .append("..") - .append(lDestructionTime) - .append(", allocatorManager: [") - .append(", life: "); + .append("ledger[") + .append(ledgerId) + .append("] allocator: ") + .append(allocator.getName()) + .append("), isOwning: ") + .append(", size: ") + .append(", references: ") + .append(bufRefCnt.get()) + .append(", life: ") + .append(lCreationTime) + .append("..") + .append(lDestructionTime) + .append(", allocatorManager: [") + .append(", life: "); if (!BaseAllocator.DEBUG) { sb.append("]\n"); @@ -499,8 +499,8 @@ void print(StringBuilder sb, int indent, BaseAllocator.Verbosity verbosity) { Preconditions.checkArgument(buffers != null, "IdentityHashMap of buffers must not be null"); synchronized (buffers) { sb.append("] holds ") - .append(buffers.size()) - .append(" buffers. \n"); + .append(buffers.size()) + .append(" buffers. \n"); for (ArrowBuf buf : buffers.keySet()) { buf.print(sb, indent + 2, verbosity); sb.append('\n'); diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java index 7d4de18751ba9..64a4232d8aeb7 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReferenceManager.java @@ -141,10 +141,12 @@ public boolean release(int decrement) { } @Override - public void retain() { } + public void retain() { + } @Override - public void retain(int increment) { } + public void retain(int increment) { + } @Override public ArrowBuf retain(ArrowBuf srcBuffer, BufferAllocator targetAllocator) { diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java index 9579245ca7004..79d21fa040876 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java @@ -32,7 +32,8 @@ public class ByteFunctionHelpers { private static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; - private ByteFunctionHelpers() {} + private ByteFunctionHelpers() { + } /** * Helper function to check for equality of bytes in two ArrowBufs. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java index ccca7b1e03093..707c5f1556062 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/CommonUtil.java @@ -24,7 +24,8 @@ */ public final class CommonUtil { - private CommonUtil() { } + private CommonUtil() { + } /** * Rounds up the provided value to the nearest power of two. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java index db63bbd14ba5f..94a7873664216 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/LargeMemoryUtil.java @@ -22,7 +22,8 @@ /** Contains utilities for dealing with a 64-bit address base. */ public final class LargeMemoryUtil { - private LargeMemoryUtil() {} + private LargeMemoryUtil() { + } /** * Casts length to an int, but raises an exception the value is outside diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java index 6b01a61ebca39..b88372abaaee1 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Collections2.java @@ -34,7 +34,8 @@ * Utility methods for manipulating {@link java.util.Collections} and their subclasses/implementations. */ public final class Collections2 { - private Collections2() {} + private Collections2() { + } /** * Creates a {@link List} from the elements remaining in iterator. diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java index 8083033007d9c..5e4323cfc9c61 100644 --- a/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/util/Preconditions.java @@ -111,7 +111,8 @@ * @since 2.0 */ public final class Preconditions { - private Preconditions() {} + private Preconditions() { + } /** * Ensures the truth of an expression involving one or more parameters to the calling method. diff --git a/java/pom.xml b/java/pom.xml index 7871303634976..6442987f5a192 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -34,9 +34,9 @@ 2.0.11 33.0.0-jre 4.1.106.Final - 1.60.0 + 1.61.1 3.23.1 - 2.16.0 + 2.16.1 3.3.6 23.5.26 1.11.3 @@ -304,7 +304,7 @@ com.puppycrawl.tools checkstyle - 8.19 + 8.29 org.slf4j @@ -364,7 +364,7 @@ org.cyclonedx cyclonedx-maven-plugin - 2.7.10 + 2.7.11 package @@ -395,7 +395,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -598,7 +598,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins @@ -803,7 +803,7 @@ org.apache.maven.plugins maven-project-info-reports-plugin - 3.0.0 + 3.5.0 org.apache.maven.plugins diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java index bb7cedeb74579..3d9bca58a763c 100644 --- a/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileToStream.java @@ -34,7 +34,8 @@ * first argument and the output is written to standard out. */ public class FileToStream { - private FileToStream() {} + private FileToStream() { + } /** * Reads an Arrow file from in and writes it back to out. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 6824756d8aca7..abece39475016 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -22,7 +22,8 @@ /** Helper utility methods for allocating storage for Vectors. */ public class AllocationHelper { - private AllocationHelper() {} + private AllocationHelper() { + } /** * Allocates the vector. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java index 568554ba75ed6..10f343e260ccc 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVectorHelper.java @@ -33,7 +33,8 @@ */ public class BitVectorHelper { - private BitVectorHelper() {} + private BitVectorHelper() { + } /** * Get the index of byte corresponding to bit index in validity buffer. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java index 6cda18a8a53d3..be501ce245410 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/GenerateSampleData.java @@ -27,7 +27,8 @@ * with sample data. This class should be used for that purpose. */ public class GenerateSampleData { - private GenerateSampleData() {} + private GenerateSampleData() { + } /** Populates vector with valueCount random values. */ public static void generateTestData(final ValueVector vector, final int valueCount) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java index d7b147feb152f..3b734bbf6608b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullVector.java @@ -300,7 +300,8 @@ public int getNullCount() { * @param index position of element */ @Override - public void setNull(int index) {} + public void setNull(int index) { + } @Override public boolean isNull(int index) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java index 0de99ab011f66..76db0734464ed 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/Range.java @@ -41,7 +41,8 @@ public class Range { /** * Constructs a new instance. */ - public Range() {} + public Range() { + } /** * Constructs a new instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java index 0098f68360a1a..2cd64c4fc6766 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java @@ -23,7 +23,8 @@ * Utility methods for state machines based on enums. */ public class StateTool { - private StateTool() {} + private StateTool() { + } static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StateTool.class); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java index 9c399669affc3..b16315caa9f51 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/ArrowMagic.java @@ -25,7 +25,8 @@ * Magic header/footer helpers for {@link ArrowFileWriter} and {@link ArrowFileReader} formatted files. */ class ArrowMagic { - private ArrowMagic(){} + private ArrowMagic(){ + } private static final byte[] MAGIC = "ARROW1".getBytes(StandardCharsets.UTF_8); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java index 26736ed91c5ca..59b3bb07bcf16 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/message/FBSerializables.java @@ -31,7 +31,8 @@ * Utility methods for {@linkplain org.apache.arrow.vector.ipc.message.FBSerializable}s. */ public class FBSerializables { - private FBSerializables() {} + private FBSerializables() { + } /** * Writes every element of all to builder and calls {@link FlatBufferBuilder#endVector()} afterwards. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java index 9e8b6d26f6fd7..f7f975a0d0e7b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -26,7 +26,8 @@ /** Utility class for Date, DateTime, TimeStamp, Interval data types. */ public class DateUtility { - private DateUtility() {} + private DateUtility() { + } private static final String UTC = "UTC"; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 0dfb61dcdf269..4635822e5141b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -29,7 +29,8 @@ * Utility methods for configurable precision Decimal values (e.g. {@link BigDecimal}). */ public class DecimalUtility { - private DecimalUtility() {} + private DecimalUtility() { + } public static final byte [] zeroes = new byte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java index 9592f3975ab99..76fb585e6bd3a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DictionaryUtility.java @@ -35,7 +35,8 @@ * Utility methods for working with Dictionaries used in Dictionary encodings. */ public class DictionaryUtility { - private DictionaryUtility() {} + private DictionaryUtility() { + } /** * Convert field and child fields that have a dictionary encoding to message format, so fields diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java index 39488e96efda0..5fa4c1b2260e3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ObjectMapperFactory.java @@ -26,7 +26,8 @@ */ public final class ObjectMapperFactory { - private ObjectMapperFactory() {} + private ObjectMapperFactory() { + } /** * Creates a new {@link ObjectMapper} instance. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java index f8167604c21ad..5b3d00f6b7362 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaUtility.java @@ -33,7 +33,8 @@ * Schema utility class including serialization and deserialization. */ public class SchemaUtility { - private SchemaUtility() {} + private SchemaUtility() { + } /** * Deserialize Arrow schema from byte array. diff --git a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java index f9f0357861c15..9e96e75880522 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/testing/ValueVectorDataPopulator.java @@ -75,7 +75,8 @@ */ public class ValueVectorDataPopulator { - private ValueVectorDataPopulator(){} + private ValueVectorDataPopulator() { + } /** * Populate values for BigIntVector. diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 31f24187e3b37..bc1dd8a09a768 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -667,6 +667,31 @@ def row_num(x): 'b': ["e", "j"], } + def test_chunker_out_of_sync(self): + # GH-39892: if there are newlines in values, the parser may become + # out of sync with the chunker. In this case, we try to produce an + # informative error message. + rows = b"""a,b,c\nd,e,"f\n"\ng,h,i\n""" + expected = { + 'a': ["d", "g"], + 'b': ["e", "h"], + 'c': ["f\n", "i"], + } + for block_size in range(8, 15): + # Sanity check: parsing works with newlines_in_values=True + d = self.read_bytes( + rows, parse_options=ParseOptions(newlines_in_values=True), + read_options=ReadOptions(block_size=block_size)).to_pydict() + assert d == expected + # With these block sizes, a block would end on the physical newline + # inside the quoted cell value, leading to a mismatch between + # CSV chunker and parser. + for block_size in range(8, 11): + with pytest.raises(ValueError, + match="cell values spanning multiple lines"): + self.read_bytes( + rows, read_options=ReadOptions(block_size=block_size)) + class BaseCSVTableRead(BaseTestCSV): diff --git a/testing b/testing index ad82a736c170e..25d16511e8d42 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit ad82a736c170e97b7c8c035ebd8a801c17eec170 +Subproject commit 25d16511e8d42c2744a1d94d90169e3a36e92631