diff --git a/.clang-format b/.clang-format index 9448dc8d8c80d..abd823c103904 100644 --- a/.clang-format +++ b/.clang-format @@ -19,3 +19,4 @@ BasedOnStyle: Google ColumnLimit: 90 DerivePointerAlignment: false IncludeBlocks: Preserve +IndentPPDirectives: AfterHash diff --git a/.dockerignore b/.dockerignore index 3791cca95e3fe..1f1715d8e833d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,11 +27,11 @@ # include explicitly !ci/** !c_glib/Gemfile -!dev/archery/setup.py !dev/release/setup-*.sh !docs/requirements*.txt +!go/go.mod +!go/go.sum !python/requirements*.txt -!python/manylinux1/** !r/DESCRIPTION !ruby/Gemfile !ruby/red-arrow/Gemfile @@ -46,20 +46,3 @@ !ruby/red-parquet/Gemfile !ruby/red-parquet/lib/parquet/version.rb !ruby/red-parquet/red-parquet.gemspec -!ruby/red-plasma/Gemfile -!ruby/red-plasma/lib/plasma/version.rb -!ruby/red-plasma/red-plasma.gemspec -!rust/Cargo.toml -!rust/benchmarks/Cargo.toml -!rust/arrow/Cargo.toml -!rust/arrow/benches -!rust/arrow-flight/Cargo.toml -!rust/parquet/Cargo.toml -!rust/parquet/build.rs -!rust/parquet_derive/Cargo.toml -!rust/parquet_derive_test/Cargo.toml -!rust/datafusion/Cargo.toml -!rust/datafusion/benches -!rust/integration-testing/Cargo.toml -!go/go.mod -!go/go.sum \ No newline at end of file diff --git a/.env b/.env index 21f904c3208f6..c8c236d5ac44b 100644 --- a/.env +++ b/.env @@ -58,8 +58,8 @@ CUDA=11.2.2 DASK=latest DOTNET=8.0 GCC_VERSION="" -GO=1.21.8 -STATICCHECK=v0.4.7 +GO=1.22.6 +STATICCHECK=v0.5.1 HDFS=3.2.1 JDK=11 KARTOTHEK=latest @@ -71,6 +71,7 @@ NUMBA=latest NUMPY=latest PANDAS=latest PYTHON=3.8 +PYTHON_IMAGE_TAG=3.8 R=4.4 SPARK=master TURBODBC=latest diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 7d9ff2f42e887..7ba9744ef005d 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -24,13 +24,6 @@ updates: commit-message: prefix: "MINOR: [CI] " open-pull-requests-limit: 10 - - package-ecosystem: "gomod" - directory: "/go/" - schedule: - interval: "weekly" - commit-message: - prefix: "MINOR: [Go] " - open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index b016f7d11b9fa..e448209056d78 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -20,12 +20,14 @@ name: Archery & Crossbow on: push: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' - 'docker-compose.yml' pull_request: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' @@ -58,7 +60,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 with: python-version: '3.9' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 1138c0a02f812..b7af4c5800835 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index a82e1eb76660b..f5c8b6a7201be 100644 --- 
a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -20,6 +20,7 @@ name: C++ on: push: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -35,6 +36,7 @@ on: - 'testing' pull_request: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -99,7 +101,6 @@ jobs: cat <> "$GITHUB_OUTPUT" { "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", "clang-tools": "10", "image": "ubuntu-cpp", "llvm": "10", @@ -124,9 +125,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use `docker compose` because docker-compose v1 is obsolete, - # except where the Docker client version is too old. - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} ARROW_SIMD_LEVEL: ${{ matrix.simd-level }} CLANG_TOOLS: ${{ matrix.clang-tools }} LLVM: ${{ matrix.llvm }} @@ -147,6 +145,7 @@ jobs: run: | sudo apt update sudo apt install -y --no-install-recommends python3 python3-dev python3-pip + python3 -m pip install -U pip - name: Setup Archery run: python3 -m pip install -e dev/archery[docker] - name: Execute Docker Build @@ -156,8 +155,7 @@ jobs: run: | # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes sudo sysctl -w vm.mmap_rnd_bits=28 - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -246,7 +244,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -273,7 +271,7 @@ jobs: shell: bash run: | sudo sysctl -w kern.coredump=1 - sudo sysctl -w kern.corefile=core.%N.%P + sudo sysctl -w kern.corefile=/tmp/core.%N.%P ulimit -c unlimited # must enable within the same shell ci/scripts/cpp_test.sh $(pwd) $(pwd)/build @@ -412,12 +410,10 @@ jobs: ARROW_WITH_SNAPPY: ON ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - # Don't use preinstalled Boost by empty BOOST_ROOT and - # -DBoost_NO_BOOST_CMAKE=ON + # Don't use preinstalled Boost by empty BOOST_ROOT BOOST_ROOT: "" ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}} - -DBoost_NO_BOOST_CMAKE=ON -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON # We can't use unity build because we don't have enough memory on # GitHub Actions. 
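One note on the `.clang-format` hunk at the top of this diff: `IndentPPDirectives: AfterHash` tells clang-format to keep `#` in column 0 and indent the directive name itself by nesting depth (the `BeforeHash` variant would indent the `#` instead). A minimal, hypothetical illustration of the resulting layout (not code from this patch):

```c
/* With IndentPPDirectives: AfterHash, nested preprocessor directives
 * are indented after the '#', which stays in column 0. */
#if defined(ARROW_EXAMPLE_OUTER)
#  if defined(ARROW_EXAMPLE_INNER)
#    define ARROW_EXAMPLE_LEVEL 2
#  else
#    define ARROW_EXAMPLE_LEVEL 1
#  endif
#endif
```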
@@ -467,16 +463,18 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 + id: python-install with: python-version: 3.9 - name: Install Google Cloud Storage Testbench - shell: bash + shell: msys2 {0} + env: + PIPX_BIN_DIR: /usr/local/bin + PIPX_BASE_PYTHON: ${{ steps.python-install.outputs.python-path }} run: | ci/scripts/install_gcs_testbench.sh default - echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV - name: Test shell: msys2 {0} run: | - PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}" ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 6e8548dc960f4..c618350affbeb 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -108,7 +108,7 @@ jobs: with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Checkout Arrow diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index cc3ff6330746d..3879a045fd239 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -45,7 +45,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install pre-commit @@ -67,8 +67,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run -e GITHUB_ACTIONS=true ubuntu-lint - name: Docker Push if: >- @@ -104,7 +103,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: '3.12' - name: Install Ruby diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 25db1c39ad89e..1219f7526f9f2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -52,7 +52,7 @@ jobs: key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index ea7fe5d02d7b8..7d540b7cecdc9 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -20,6 +20,7 @@ name: Docs on: pull_request: paths: + - '.dockerignore' - 'docs/**' - '.github/workflows/docs_light.yml' - 'ci/docker/conda.dockerfile' @@ -58,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml 
index 20c78d86cb2a3..d463549206471 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -20,6 +20,7 @@ name: Go on: push: paths: + - '.dockerignore' - '.github/workflows/go.yml' - 'ci/docker/*_go.dockerfile' - 'ci/scripts/go_*' @@ -27,6 +28,7 @@ on: - 'go/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/go.yml' - 'ci/docker/*_go.dockerfile' - 'ci/docker/**' @@ -62,13 +64,13 @@ jobs: { "arch-label": "AMD64", "arch": "amd64", - "go": "1.21", + "go": "1.22", "runs-on": "ubuntu-latest" }, { "arch-label": "AMD64", "arch": "amd64", - "go": "1.22", + "go": "1.23", "runs-on": "ubuntu-latest" } JSON @@ -78,15 +80,13 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", - "go": "1.21", + "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "archery-use-legacy-docker-compose": "1", - "go": "1.22", + "go": "1.23", "runs-on": ["self-hosted", "arm", "linux"] } JSON @@ -106,9 +106,6 @@ jobs: include: ${{ fromJson(needs.docker-targets.outputs.targets) }} env: ARCH: ${{ matrix.arch }} - # By default, use Docker CLI because docker-compose v1 is obsolete, - # except where the Docker client version is too old. - ARCHERY_USE_LEGACY_DOCKER_COMPOSE: ${{ matrix.archery-use-legacy-docker-compose || '0' }} GO: ${{ matrix.go }} steps: - name: Checkout Arrow @@ -202,7 +199,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: GO: ${{ matrix.go }} steps: @@ -212,7 +209,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -243,7 +240,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: GO: ${{ matrix.go }} steps: @@ -252,7 +249,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -282,7 +279,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -315,7 +312,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -328,7 +325,7 @@ jobs: go-version: ${{ matrix.go }} cache: true cache-dependency-path: go/go.sum - - name: Install staticcheck + - name: Install staticcheck run: | . .env go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} @@ -344,7 +341,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: '3.10' - name: Run Benchmarks @@ -373,7 +370,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: ARROW_GO_TESTCGO: "1" steps: @@ -444,7 +441,7 @@ jobs: ci/scripts/msys2_setup.sh cgo - name: Get required Go version run: | - (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV + (. 
.env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Update CGO Env vars shell: msys2 {0} run: | diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 43f8af0a600d8..b73f900e616f5 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -20,6 +20,7 @@ name: Integration on: push: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -33,6 +34,7 @@ on: - 'format/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -89,7 +91,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -98,7 +100,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: > + run: | + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 0317879b580ba..57f834bcbabee 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -20,6 +20,7 @@ name: Java on: push: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -76,7 +78,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index c2bc679e681a2..e730a5bf3e672 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -20,6 +20,7 @@ name: Java JNI on: push: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -70,7 +72,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -79,7 +81,9 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run java-jni-manylinux-2014 + run: | + source ci/scripts/util_enable_core_dumps.sh + archery docker run java-jni-manylinux-2014 - name: Docker Push if: >- success() && @@ -110,7 +114,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git 
a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 72afb6dbf1c1d..0bf0c27288faf 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 630bef61105f6..9ab4edf0851cd 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -20,12 +20,14 @@ name: NodeJS on: push: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' - 'js/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' @@ -54,7 +56,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -64,8 +66,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run debian-js - name: Docker Push if: >- diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 7dd06b6aeec09..bbb1a2d7228d0 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 0cd89b3206715..68f922ce8b4d9 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4.4.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 916db2580e371..45efd305aa8f6 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -20,6 +20,7 @@ name: Python on: push: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -27,6 +28,7 @@ on: - 'python/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -59,6 +61,7 @@ jobs: - conda-python-3.9-nopandas - conda-python-3.8-pandas-1.0 - conda-python-3.10-pandas-latest + - conda-python-3.10-no-numpy include: - name: conda-python-docs cache: conda-python-3.9 @@ -83,6 +86,11 @@ jobs: title: AMD64 Conda Python 3.10 Pandas latest python: "3.10" pandas: latest + - name: conda-python-3.10-no-numpy + cache: conda-python-3.10 + image: conda-python-no-numpy + title: AMD64 Conda Python 3.10 without NumPy + python: "3.10" env: PYTHON: ${{ matrix.python || 3.8 }} UBUNTU: ${{ matrix.ubuntu || 20.04 }} @@ -101,7 +109,7 @@ jobs: key: ${{ matrix.cache }}-${{ 
hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -111,8 +119,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ${{ matrix.image }} - name: Docker Push if: >- @@ -163,7 +170,7 @@ jobs: ARROW_BUILD_TESTS: OFF PYARROW_TEST_LARGE_MEMORY: ON # Current oldest supported version according to https://endoflife.date/macos - MACOSX_DEPLOYMENT_TARGET: 10.15 + MACOSX_DEPLOYMENT_TARGET: 12.0 steps: - name: Checkout Arrow uses: actions/checkout@v4 @@ -171,7 +178,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 with: python-version: '3.11' - name: Install Dependencies diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 2820d42470bca..92e0e63fb7ea5 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -20,6 +20,7 @@ name: R on: push: paths: + - '.dockerignore' - ".github/workflows/r.yml" - "ci/docker/**" - "ci/etc/rprofile" @@ -32,6 +33,7 @@ on: - "r/**" pull_request: paths: + - '.dockerignore' - ".github/workflows/r.yml" - "ci/docker/**" - "ci/etc/rprofile" @@ -146,7 +148,7 @@ jobs: ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h)') }}- ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -156,8 +158,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Setting a non-default and non-probable Marquesas French Polynesia time # it has both with a .45 offset and very very few people who live there. archery docker run -e TZ=MART -e ARROW_R_FORCE_TESTS=${{ matrix.force-tests }} ubuntu-r @@ -169,9 +170,9 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: test-output + name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }} path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: >- @@ -206,7 +207,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -216,8 +217,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh # Don't set a TZ here to test that case. These builds will have the following warning in them: # System has not been booted with systemd as init system (PID 1). Can't operate. 
# Failed to connect to bus: Host is down @@ -230,9 +230,9 @@ jobs: if: always() - name: Save the test output if: always() - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: - name: test-output + name: test-output-bundled path: r/check/arrow.Rcheck/tests/testthat.Rout* - name: Docker Push if: >- @@ -292,7 +292,7 @@ jobs: # So that they're unique when multiple are downloaded in the next step shell: bash run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # # v4.0.0 with: name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip @@ -330,7 +330,7 @@ jobs: echo "$HOME/.local/bin" >> $GITHUB_PATH - run: mkdir r/windows - name: Download artifacts - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.8 with: name: libarrow-rtools40-ucrt64.zip path: r/windows diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml index 1ec071b6bbb5e..9817e41d3b61d 100644 --- a/.github/workflows/r_nightly.yml +++ b/.github/workflows/r_nightly.yml @@ -60,7 +60,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml index e4d650e74a8ad..05b7b317ffd96 100644 --- a/.github/workflows/ruby.yml +++ b/.github/workflows/ruby.yml @@ -20,6 +20,7 @@ name: C GLib & Ruby on: push: paths: + - '.dockerignore' - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' @@ -33,6 +34,7 @@ on: - 'ruby/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' @@ -83,7 +85,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -93,8 +95,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run \ -e ARROW_FLIGHT=ON \ -e ARROW_FLIGHT_SQL=ON \ @@ -406,7 +407,10 @@ jobs: -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" - name: Build C++ vcpkg dependencies run: | - vcpkg\vcpkg.exe install --triplet $env:VCPKG_TRIPLET --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed + vcpkg\vcpkg.exe install ` + --triplet $env:VCPKG_TRIPLET ` + --x-manifest-root cpp ` + --x-install-root build\cpp\vcpkg_installed - name: Build C++ shell: cmd run: | diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 1b3c9eca1814a..87aa5cb83f714 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -20,6 +20,7 @@ name: Swift on: push: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 
'ci/scripts/swift_*' @@ -27,6 +28,7 @@ on: - 'swift/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' @@ -63,8 +65,7 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} run: | - sudo sysctl -w kernel.core_pattern="core.%e.%p" - ulimit -c unlimited + source ci/scripts/util_enable_core_dumps.sh archery docker run ubuntu-swift - name: Docker Push if: >- diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf0bcde14622a..91017969eb502 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -78,6 +78,26 @@ repos: ?^cpp/src/generated/| ?^cpp/thirdparty/| ) + - repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + name: C++ Lint + args: + - "--verbose=2" + types_or: + - c++ + files: >- + ^cpp/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?\.pb\.(cc|h)$| + ?_generated.*\.(cc|h)$| + ?^cpp/src/arrow/vendored/| + ?^cpp/src/generated/| + ?^cpp/thirdparty/| + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000000000..2f47b4dbf57b7 --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +filter = -build/c++11 +filter = -build/header_guard +filter = -build/include_order +filter = -build/include_what_you_use +filter = -readability/alt_tokens +# readability/casting is disabled as it aggressively warns about +# functions with names like "int32", so "int32(x)", where int32 is a +# function name, warns with +filter = -readability/casting +filter = -readability/todo +filter = -runtime/references +filter = -whitespace/comments +linelength = 90 diff --git a/appveyor.yml b/appveyor.yml index 5954251d34733..9e4582f1d8d7f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -24,6 +24,7 @@ only_commits: - appveyor.yml - ci/appveyor* - ci/conda* + - ci/scripts/*.bat - cpp/ - format/ - python/ diff --git a/c_glib/arrow-flight-glib/client.cpp b/c_glib/arrow-flight-glib/client.cpp index 80c47e336f872..75b02ec25869f 100644 --- a/c_glib/arrow-flight-glib/client.cpp +++ b/c_glib/arrow-flight-glib/client.cpp @@ -33,10 +33,19 @@ G_BEGIN_DECLS * #GAFlightStreamReader is a class for reading record batches from a * server. * + * #GAFlightStreamWriter is a class for writing record batches to a + * server. + * + * #GAFlightMetadataReader is a class for reading metadata from a + * server. + * * #GAFlightCallOptions is a class for options of each call. * * #GAFlightClientOptions is a class for options of each client. * + * #GAFlightDoPutResult is a class that has gaflight_client_do_put() + * result. + * * #GAFlightClient is a class for Apache Arrow Flight client. 
* * Since: 5.0.0 @@ -56,6 +65,128 @@ gaflight_stream_reader_class_init(GAFlightStreamReaderClass *klass) { } +G_DEFINE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT_TYPE_RECORD_BATCH_WRITER) + +static void +gaflight_stream_writer_init(GAFlightStreamWriter *object) +{ +} + +static void +gaflight_stream_writer_class_init(GAFlightStreamWriterClass *klass) +{ +} + +/** + * gaflight_stream_writer_done_writing: + * @writer: A #GAFlightStreamWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 18.0.0 + */ +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error) +{ + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); + return garrow::check(error, + flight_writer->DoneWriting(), + "[flight-stream-writer][done-writing]"); +} + +struct GAFlightMetadataReaderPrivate +{ + arrow::flight::FlightMetadataReader *reader; +}; + +enum { + PROP_METADATA_READER_READER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataReader, + gaflight_metadata_reader, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_READER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_reader_get_instance_private(GAFLIGHT_METADATA_READER(object))) + +static void +gaflight_metadata_reader_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + delete priv->reader; + G_OBJECT_CLASS(gaflight_metadata_reader_parent_class)->finalize(object); +} + +static void +gaflight_metadata_reader_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_METADATA_READER_READER: + priv->reader = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_reader_init(GAFlightMetadataReader *object) +{ +} + +static void +gaflight_metadata_reader_class_init(GAFlightMetadataReaderClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_reader_finalize; + gobject_class->set_property = gaflight_metadata_reader_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "reader", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA_READER_READER, spec); +} + +/** + * gaflight_metadata_reader_read: + * @reader: A #GAFlightMetadataReader. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): The metadata on success, %NULL on error. 
+ * + * Since: 18.0.0 + */ +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error) +{ + auto flight_reader = gaflight_metadata_reader_get_raw(reader); + std::shared_ptr metadata; + if (garrow::check(error, + flight_reader->ReadMetadata(&metadata), + "[flight-metadata-reader][read]")) { + return garrow_buffer_new_raw(&metadata); + } else { + return nullptr; + } +} + typedef struct GAFlightCallOptionsPrivate_ { arrow::flight::FlightCallOptions options; @@ -385,6 +516,139 @@ gaflight_client_options_new(void) g_object_new(GAFLIGHT_TYPE_CLIENT_OPTIONS, NULL)); } +struct GAFlightDoPutResultPrivate +{ + GAFlightStreamWriter *writer; + GAFlightMetadataReader *reader; +}; + +enum { + PROP_DO_PUT_RESULT_RESULT = 1, + PROP_DO_PUT_RESULT_WRITER, + PROP_DO_PUT_RESULT_READER, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightDoPutResult, gaflight_do_put_result, G_TYPE_OBJECT) + +#define GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object) \ + static_cast( \ + gaflight_do_put_result_get_instance_private(GAFLIGHT_DO_PUT_RESULT(object))) + +static void +gaflight_do_put_result_dispose(GObject *object) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + if (priv->writer) { + g_object_unref(priv->writer); + priv->writer = nullptr; + } + + if (priv->reader) { + g_object_unref(priv->reader); + priv->reader = nullptr; + } + + G_OBJECT_CLASS(gaflight_do_put_result_parent_class)->dispose(object); +} + +static void +gaflight_do_put_result_init(GAFlightDoPutResult *object) +{ +} + +static void +gaflight_do_put_result_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_RESULT: + { + auto result = static_cast( + g_value_get_pointer(value)); + std::shared_ptr writer = + std::move(result->writer); + priv->writer = gaflight_stream_writer_new_raw(&writer); + priv->reader = gaflight_metadata_reader_new_raw(result->reader.release()); + break; + } + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_DO_PUT_RESULT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_DO_PUT_RESULT_WRITER: + g_value_set_object(value, priv->writer); + break; + case PROP_DO_PUT_RESULT_READER: + g_value_set_object(value, priv->reader); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_do_put_result_class_init(GAFlightDoPutResultClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gaflight_do_put_result_dispose; + gobject_class->set_property = gaflight_do_put_result_set_property; + gobject_class->get_property = gaflight_do_put_result_get_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "result", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_RESULT, spec); + + /** + * GAFlightDoPutResult:writer: + * + * A writer to write record batches to. 
+   *
+   * Since: 18.0.0
+   */
+  spec = g_param_spec_object("writer",
+                             nullptr,
+                             nullptr,
+                             GAFLIGHT_TYPE_STREAM_WRITER,
+                             static_cast<GParamFlags>(G_PARAM_READABLE));
+  g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_WRITER, spec);
+
+  /**
+   * GAFlightDoPutResult:reader:
+   *
+   * A reader for application metadata from the server.
+   *
+   * Since: 18.0.0
+   */
+  spec = g_param_spec_object("reader",
+                             nullptr,
+                             nullptr,
+                             GAFLIGHT_TYPE_METADATA_READER,
+                             static_cast<GParamFlags>(G_PARAM_READABLE));
+  g_object_class_install_property(gobject_class, PROP_DO_PUT_RESULT_READER, spec);
+}
+
 struct GAFlightClientPrivate
 {
   std::shared_ptr<arrow::flight::FlightClient> client;
@@ -661,6 +925,51 @@ gaflight_client_do_get(GAFlightClient *client,
   return gaflight_stream_reader_new_raw(flight_reader.release(), TRUE);
 }
 
+/**
+ * gaflight_client_do_put:
+ * @client: A #GAFlightClient.
+ * @descriptor: A #GAFlightDescriptor.
+ * @schema: A #GArrowSchema.
+ * @options: (nullable): A #GAFlightCallOptions.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Upload data to a Flight described by the given descriptor. The
+ * caller must call garrow_record_batch_writer_close() on the
+ * returned stream once they are done writing.
+ *
+ * The reader and writer are linked; closing the writer will also
+ * close the reader. Use gaflight_stream_writer_done_writing() to
+ * only close the write side of the channel.
+ *
+ * Returns: (nullable) (transfer full):
+ *   The #GAFlightDoPutResult holding a reader and a writer on success,
+ *   %NULL on error.
+ *
+ * Since: 18.0.0
+ */
+GAFlightDoPutResult *
+gaflight_client_do_put(GAFlightClient *client,
+                       GAFlightDescriptor *descriptor,
+                       GArrowSchema *schema,
+                       GAFlightCallOptions *options,
+                       GError **error)
+{
+  auto flight_client = gaflight_client_get_raw(client);
+  auto flight_descriptor = gaflight_descriptor_get_raw(descriptor);
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  arrow::flight::FlightCallOptions flight_default_options;
+  auto flight_options = &flight_default_options;
+  if (options) {
+    flight_options = gaflight_call_options_get_raw(options);
+  }
+  auto result = flight_client->DoPut(*flight_options, *flight_descriptor, arrow_schema);
+  if (!garrow::check(error, result, "[flight-client][do-put]")) {
+    return nullptr;
+  }
+  auto flight_result = std::move(*result);
+  return gaflight_do_put_result_new_raw(&flight_result);
+}
+
 G_END_DECLS
 
 GAFlightStreamReader *
@@ -672,7 +981,31 @@ gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader,
                                              flight_reader,
                                              "is-owner",
                                              is_owner,
-                                             NULL));
+                                             nullptr));
+}
+
+GAFlightStreamWriter *
+gaflight_stream_writer_new_raw(
+  std::shared_ptr<arrow::flight::FlightStreamWriter> *flight_writer)
+{
+  return GAFLIGHT_STREAM_WRITER(g_object_new(GAFLIGHT_TYPE_STREAM_WRITER,
+                                             "record-batch-writer",
+                                             flight_writer,
+                                             nullptr));
+}
+
+GAFlightMetadataReader *
+gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader)
+{
+  return GAFLIGHT_METADATA_READER(
+    g_object_new(GAFLIGHT_TYPE_METADATA_READER, "reader", flight_reader, nullptr));
+}
+
+arrow::flight::FlightMetadataReader *
+gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader)
+{
+  auto priv = GAFLIGHT_METADATA_READER_GET_PRIVATE(reader);
+  return priv->reader;
 }
 
 arrow::flight::FlightCallOptions *
@@ -689,6 +1022,13 @@ gaflight_client_options_get_raw(GAFlightClientOptions *options)
   return &(priv->options);
 }
 
+GAFlightDoPutResult *
+gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result)
+{
+  return GAFLIGHT_DO_PUT_RESULT(
+
g_object_new(GAFLIGHT_TYPE_DO_PUT_RESULT, "result", flight_result, nullptr)); +} + std::shared_ptr gaflight_client_get_raw(GAFlightClient *client) { diff --git a/c_glib/arrow-flight-glib/client.h b/c_glib/arrow-flight-glib/client.h index a91bbe55e3c04..12c5a06b810e1 100644 --- a/c_glib/arrow-flight-glib/client.h +++ b/c_glib/arrow-flight-glib/client.h @@ -35,6 +35,35 @@ struct _GAFlightStreamReaderClass GAFlightRecordBatchReaderClass parent_class; }; +#define GAFLIGHT_TYPE_STREAM_WRITER (gaflight_stream_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE(GAFlightStreamWriter, + gaflight_stream_writer, + GAFLIGHT, + STREAM_WRITER, + GAFlightRecordBatchWriter) +struct _GAFlightStreamWriterClass +{ + GAFlightRecordBatchWriterClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_stream_writer_done_writing(GAFlightStreamWriter *writer, GError **error); + +#define GAFLIGHT_TYPE_METADATA_READER (gaflight_metadata_reader_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataReader, gaflight_metadata_reader, GAFLIGHT, METADATA_READER, GObject) +struct _GAFlightMetadataReaderClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +GArrowBuffer * +gaflight_metadata_reader_read(GAFlightMetadataReader *reader, GError **error); + #define GAFLIGHT_TYPE_CALL_OPTIONS (gaflight_call_options_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE( @@ -75,6 +104,15 @@ GAFLIGHT_AVAILABLE_IN_5_0 GAFlightClientOptions * gaflight_client_options_new(void); +#define GAFLIGHT_TYPE_DO_PUT_RESULT (gaflight_do_put_result_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightDoPutResult, gaflight_do_put_result, GAFLIGHT, DO_PUT_RESULT, GObject) +struct _GAFlightDoPutResultClass +{ + GObjectClass parent_class; +}; + #define GAFLIGHT_TYPE_CLIENT (gaflight_client_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightClient, gaflight_client, GAFLIGHT, CLIENT, GObject) @@ -124,4 +162,12 @@ gaflight_client_do_get(GAFlightClient *client, GAFlightCallOptions *options, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +GAFlightDoPutResult * +gaflight_client_do_put(GAFlightClient *client, + GAFlightDescriptor *descriptor, + GArrowSchema *schema, + GAFlightCallOptions *options, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/client.hpp b/c_glib/arrow-flight-glib/client.hpp index 185a28e6dc4bd..32ad35845aa12 100644 --- a/c_glib/arrow-flight-glib/client.hpp +++ b/c_glib/arrow-flight-glib/client.hpp @@ -28,6 +28,19 @@ GAFlightStreamReader * gaflight_stream_reader_new_raw(arrow::flight::FlightStreamReader *flight_reader, gboolean is_owner); +GAFLIGHT_EXTERN +GAFlightStreamWriter * +gaflight_stream_writer_new_raw( + std::shared_ptr *flight_writer); + +GAFLIGHT_EXTERN +GAFlightMetadataReader * +gaflight_metadata_reader_new_raw(arrow::flight::FlightMetadataReader *flight_reader); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataReader * +gaflight_metadata_reader_get_raw(GAFlightMetadataReader *reader); + GAFLIGHT_EXTERN arrow::flight::FlightCallOptions * gaflight_call_options_get_raw(GAFlightCallOptions *options); @@ -36,6 +49,10 @@ GAFLIGHT_EXTERN arrow::flight::FlightClientOptions * gaflight_client_options_get_raw(GAFlightClientOptions *options); +GAFLIGHT_EXTERN +GAFlightDoPutResult * +gaflight_do_put_result_new_raw(arrow::flight::FlightClient::DoPutResult *flight_result); + GAFLIGHT_EXTERN std::shared_ptr gaflight_client_get_raw(GAFlightClient *client); diff --git 
a/c_glib/arrow-flight-glib/common.cpp b/c_glib/arrow-flight-glib/common.cpp index f7eea08c264b3..3deaf67cc14e8 100644 --- a/c_glib/arrow-flight-glib/common.cpp +++ b/c_glib/arrow-flight-glib/common.cpp @@ -1196,7 +1196,7 @@ gaflight_record_batch_reader_finalize(GObject *object) if (priv->is_owner) { delete priv->reader; } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); + G_OBJECT_CLASS(gaflight_record_batch_reader_parent_class)->finalize(object); } static void @@ -1300,57 +1300,9 @@ gaflight_record_batch_reader_read_all(GAFlightRecordBatchReader *reader, GError } } -typedef struct GAFlightRecordBatchWriterPrivate_ -{ - arrow::flight::MetadataRecordBatchWriter *writer; - bool is_owner; -} GAFlightRecordBatchWriterPrivate; - -enum { - PROP_RECORD_BATCH_WRITER_WRITER = 1, - PROP_RECORD_BATCH_WRITER_IS_OWNER, -}; - -G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GAFlightRecordBatchWriter, - gaflight_record_batch_writer, - GARROW_TYPE_RECORD_BATCH_WRITER) - -#define GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object) \ - static_cast( \ - gaflight_record_batch_writer_get_instance_private( \ - GAFLIGHT_RECORD_BATCH_WRITER(object))) - -static void -gaflight_record_batch_writer_finalize(GObject *object) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - if (priv->is_owner) { - delete priv->writer; - } - G_OBJECT_CLASS(gaflight_info_parent_class)->finalize(object); -} - -static void -gaflight_record_batch_writer_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_RECORD_BATCH_WRITER_WRITER: - priv->writer = - static_cast(g_value_get_pointer(value)); - break; - case PROP_RECORD_BATCH_WRITER_IS_OWNER: - priv->is_owner = g_value_get_boolean(value); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} +G_DEFINE_ABSTRACT_TYPE(GAFlightRecordBatchWriter, + gaflight_record_batch_writer, + GARROW_TYPE_RECORD_BATCH_WRITER) static void gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) @@ -1360,26 +1312,6 @@ gaflight_record_batch_writer_init(GAFlightRecordBatchWriter *object) static void gaflight_record_batch_writer_class_init(GAFlightRecordBatchWriterClass *klass) { - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = gaflight_record_batch_writer_finalize; - gobject_class->set_property = gaflight_record_batch_writer_set_property; - - GParamSpec *spec; - spec = g_param_spec_pointer( - "writer", - nullptr, - nullptr, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_WRITER, spec); - - spec = g_param_spec_boolean( - "is-owner", - nullptr, - nullptr, - TRUE, - static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_RECORD_BATCH_WRITER_IS_OWNER, spec); } /** @@ -1402,7 +1334,8 @@ gaflight_record_batch_writer_begin(GAFlightRecordBatchWriter *writer, GArrowWriteOptions *options, GError **error) { - auto flight_writer = gaflight_record_batch_writer_get_raw(writer); + auto flight_writer = std::static_pointer_cast( + garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer))); auto arrow_schema = garrow_schema_get_raw(schema); arrow::ipc::IpcWriteOptions arrow_write_options; if (options) { @@ -1432,7 +1365,8 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer, GArrowBuffer *metadata, GError **error) { 
-  auto flight_writer = gaflight_record_batch_writer_get_raw(writer);
+  auto flight_writer = std::static_pointer_cast<arrow::flight::MetadataRecordBatchWriter>(
+    garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer)));
   auto arrow_metadata = garrow_buffer_get_raw(metadata);
   return garrow::check(error,
                        flight_writer->WriteMetadata(arrow_metadata),
@@ -1440,7 +1374,7 @@
 }
 
 /**
- * gaflight_record_batch_writer_write:
+ * gaflight_record_batch_writer_write_record_batch:
  * @writer: A #GAFlightRecordBatchWriter.
  * @record_batch: A #GArrowRecordBatch.
  * @metadata: (nullable): A #GArrowBuffer.
@@ -1453,12 +1387,13 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer,
  * Since: 18.0.0
  */
 gboolean
-gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer,
-                                   GArrowRecordBatch *record_batch,
-                                   GArrowBuffer *metadata,
-                                   GError **error)
+gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer,
+                                                GArrowRecordBatch *record_batch,
+                                                GArrowBuffer *metadata,
+                                                GError **error)
 {
-  auto flight_writer = gaflight_record_batch_writer_get_raw(writer);
+  auto flight_writer = std::static_pointer_cast<arrow::flight::MetadataRecordBatchWriter>(
+    garrow_record_batch_writer_get_raw(GARROW_RECORD_BATCH_WRITER(writer)));
   auto arrow_record_batch = garrow_record_batch_get_raw(record_batch);
   auto arrow_metadata = garrow_buffer_get_raw(metadata);
   return garrow::check(
@@ -1599,10 +1534,3 @@ gaflight_record_batch_reader_get_raw(GAFlightRecordBatchReader *reader)
   auto priv = GAFLIGHT_RECORD_BATCH_READER_GET_PRIVATE(reader);
   return priv->reader;
 }
-
-arrow::flight::MetadataRecordBatchWriter *
-gaflight_record_batch_writer_get_raw(GAFlightRecordBatchWriter *writer)
-{
-  auto priv = GAFLIGHT_RECORD_BATCH_WRITER_GET_PRIVATE(writer);
-  return priv->writer;
-}
diff --git a/c_glib/arrow-flight-glib/common.h b/c_glib/arrow-flight-glib/common.h
index 91c828caabb36..726132fe4921b 100644
--- a/c_glib/arrow-flight-glib/common.h
+++ b/c_glib/arrow-flight-glib/common.h
@@ -259,9 +259,9 @@ gaflight_record_batch_writer_write_metadata(GAFlightRecordBatchWriter *writer,
 
 GAFLIGHT_AVAILABLE_IN_18_0
 gboolean
-gaflight_record_batch_writer_write(GAFlightRecordBatchWriter *writer,
-                                   GArrowRecordBatch *record_batch,
-                                   GArrowBuffer *metadata,
-                                   GError **error);
+gaflight_record_batch_writer_write_record_batch(GAFlightRecordBatchWriter *writer,
+                                                GArrowRecordBatch *record_batch,
+                                                GArrowBuffer *metadata,
+                                                GError **error);
 
 G_END_DECLS
diff --git a/c_glib/arrow-flight-glib/server.cpp b/c_glib/arrow-flight-glib/server.cpp
index f7444918e90f6..e39fd97b0d06c 100644
--- a/c_glib/arrow-flight-glib/server.cpp
+++ b/c_glib/arrow-flight-glib/server.cpp
@@ -45,6 +45,9 @@ G_BEGIN_DECLS
  * client. Also allows reading application-defined metadata via the
  * Flight protocol.
  *
+ * #GAFlightMetadataWriter is a class for sending application-specific
+ * metadata back to the client during an upload.
+ *
  * #GAFlightServerAuthSender is a class for sending messages to the
 * client during an authentication handshake.
* @@ -290,6 +293,98 @@ gaflight_message_reader_get_descriptor(GAFlightMessageReader *reader) return gaflight_descriptor_new_raw(&flight_descriptor); } +struct GAFlightMetadataWriterPrivate +{ + arrow::flight::FlightMetadataWriter *writer; +}; + +enum { + PROP_WRITER = 1, +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GAFlightMetadataWriter, + gaflight_metadata_writer, + G_TYPE_OBJECT) + +#define GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object) \ + static_cast( \ + gaflight_metadata_writer_get_instance_private(GAFLIGHT_METADATA_WRITER(object))) + +static void +gaflight_metadata_writer_finalize(GObject *object) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object); + + delete priv->writer; + + G_OBJECT_CLASS(gaflight_metadata_writer_parent_class)->finalize(object); +} + +static void +gaflight_metadata_writer_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_WRITER: + priv->writer = + static_cast(g_value_get_pointer(value)); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gaflight_metadata_writer_init(GAFlightMetadataWriter *object) +{ +} + +static void +gaflight_metadata_writer_class_init(GAFlightMetadataWriterClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->finalize = gaflight_metadata_writer_finalize; + gobject_class->set_property = gaflight_metadata_writer_set_property; + + GParamSpec *spec; + spec = g_param_spec_pointer( + "writer", + nullptr, + nullptr, + static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_WRITER, spec); +} + +/** + * gaflight_metadata_writer_write: + * @writer: A #GAFlightMetadataWriter. + * @metadata: A #GArrowBuffer to be sent. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Writes metadata to the client. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_metadata_writer_write(GAFlightMetadataWriter *writer, + GArrowBuffer *metadata, + GError **error) +{ + auto flight_writer = gaflight_metadata_writer_get_raw(writer); + auto flight_metadata = garrow_buffer_get_raw(metadata); + return garrow::check(error, + flight_writer->WriteMetadata(*flight_metadata), + "[flight-metadata-writer][write]"); +} + struct GAFlightServerCallContextPrivate { arrow::flight::ServerCallContext *call_context; @@ -1034,6 +1129,34 @@ namespace gaflight { return arrow::Status::OK(); } + arrow::Status + DoPut(const arrow::flight::ServerCallContext &context, + std::unique_ptr reader, + std::unique_ptr writer) override + { + auto gacontext = gaflight_server_call_context_new_raw(&context); + auto gareader = gaflight_message_reader_new_raw(reader.release(), TRUE); + auto gawriter = gaflight_metadata_writer_new_raw(writer.release()); + GError *gerror = nullptr; + auto success = + gaflight_server_do_put(gaserver_, gacontext, gareader, gawriter, &gerror); + g_object_unref(gawriter); + g_object_unref(gareader); + g_object_unref(gacontext); + if (!success && !gerror) { + g_set_error(&gerror, + GARROW_ERROR, + GARROW_ERROR_UNKNOWN, + "GAFlightServerClass::do_put() returns FALSE but error isn't set"); + } + if (gerror) { + return garrow_error_to_status(gerror, + arrow::StatusCode::UnknownError, + "[flight-server][do-put]"); + } + return arrow::Status::OK(); + } + private: GAFlightServer *gaserver_; }; @@ -1228,6 +1351,35 @@ gaflight_server_do_get(GAFlightServer *server, return (*(klass->do_get))(server, context, ticket, error); } +/** + * gaflight_server_do_put: + * @server: A #GAFlightServer. + * @context: A #GAFlightServerCallContext. + * @reader: A #GAFlightMessageReader. + * @writer: A #GAFlightMetadataWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Processes a stream of IPC payloads sent from a client. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 18.0.0 + */ +gboolean +gaflight_server_do_put(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error) +{ + auto klass = GAFLIGHT_SERVER_GET_CLASS(server); + if (!(klass && klass->do_put)) { + g_set_error(error, GARROW_ERROR, GARROW_ERROR_NOT_IMPLEMENTED, "not implemented"); + return false; + } + return klass->do_put(server, context, reader, writer, error); +} + G_END_DECLS arrow::flight::FlightDataStream * @@ -1257,6 +1409,20 @@ gaflight_message_reader_get_raw(GAFlightMessageReader *reader) return static_cast(flight_reader); } +GAFlightMetadataWriter * +gaflight_metadata_writer_new_raw(arrow::flight::FlightMetadataWriter *flight_writer) +{ + return GAFLIGHT_METADATA_WRITER( + g_object_new(GAFLIGHT_TYPE_METADATA_WRITER, "writer", flight_writer, nullptr)); +} + +arrow::flight::FlightMetadataWriter * +gaflight_metadata_writer_get_raw(GAFlightMetadataWriter *writer) +{ + auto priv = GAFLIGHT_METADATA_WRITER_GET_PRIVATE(writer); + return priv->writer; +} + GAFlightServerCallContext * gaflight_server_call_context_new_raw( const arrow::flight::ServerCallContext *flight_call_context) diff --git a/c_glib/arrow-flight-glib/server.h b/c_glib/arrow-flight-glib/server.h index 7e594febb172f..e3a469098b32c 100644 --- a/c_glib/arrow-flight-glib/server.h +++ b/c_glib/arrow-flight-glib/server.h @@ -65,6 +65,21 @@ GAFLIGHT_AVAILABLE_IN_14_0 GAFlightDescriptor * gaflight_message_reader_get_descriptor(GAFlightMessageReader *reader); +#define GAFLIGHT_TYPE_METADATA_WRITER (gaflight_metadata_writer_get_type()) +GAFLIGHT_AVAILABLE_IN_18_0 +G_DECLARE_DERIVABLE_TYPE( + GAFlightMetadataWriter, gaflight_metadata_writer, GAFLIGHT, METADATA_WRITER, GObject) +struct _GAFlightMetadataWriterClass +{ + GObjectClass parent_class; +}; + +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_metadata_writer_write(GAFlightMetadataWriter *writer, + GArrowBuffer *metadata, + GError **error); + #define GAFLIGHT_TYPE_SERVER_CALL_CONTEXT (gaflight_server_call_context_get_type()) GAFLIGHT_AVAILABLE_IN_5_0 G_DECLARE_DERIVABLE_TYPE(GAFlightServerCallContext, @@ -199,6 +214,7 @@ G_DECLARE_DERIVABLE_TYPE(GAFlightServer, gaflight_server, GAFLIGHT, SERVER, GObj * GAFlightServerClass: * @list_flights: A virtual function to implement `ListFlights` API. * @do_get: A virtual function to implement `DoGet` API. + * @do_put: A virtual function to implement `DoPut` API. 
* * Since: 5.0.0 */ @@ -218,6 +234,11 @@ struct _GAFlightServerClass GAFlightServerCallContext *context, GAFlightTicket *ticket, GError **error); + gboolean (*do_put)(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error); }; GAFLIGHT_AVAILABLE_IN_5_0 @@ -254,4 +275,12 @@ gaflight_server_do_get(GAFlightServer *server, GAFlightTicket *ticket, GError **error); +GAFLIGHT_AVAILABLE_IN_18_0 +gboolean +gaflight_server_do_put(GAFlightServer *server, + GAFlightServerCallContext *context, + GAFlightMessageReader *reader, + GAFlightMetadataWriter *writer, + GError **error); + G_END_DECLS diff --git a/c_glib/arrow-flight-glib/server.hpp b/c_glib/arrow-flight-glib/server.hpp index ec4815751c8d8..f68eef83781ec 100644 --- a/c_glib/arrow-flight-glib/server.hpp +++ b/c_glib/arrow-flight-glib/server.hpp @@ -36,6 +36,14 @@ GAFLIGHT_EXTERN arrow::flight::FlightMessageReader * gaflight_message_reader_get_raw(GAFlightMessageReader *reader); +GAFLIGHT_EXTERN +GAFlightMetadataWriter * +gaflight_metadata_writer_new_raw(arrow::flight::FlightMetadataWriter *flight_writer); + +GAFLIGHT_EXTERN +arrow::flight::FlightMetadataWriter * +gaflight_metadata_writer_get_raw(GAFlightMetadataWriter *writer); + GAFLIGHT_EXTERN GAFlightServerCallContext * gaflight_server_call_context_new_raw( diff --git a/c_glib/arrow-glib/writer.cpp b/c_glib/arrow-glib/writer.cpp index b0321d51b3ba4..08af1c7976965 100644 --- a/c_glib/arrow-glib/writer.cpp +++ b/c_glib/arrow-glib/writer.cpp @@ -45,14 +45,14 @@ G_BEGIN_DECLS * batches in file format into output. */ -typedef struct GArrowRecordBatchWriterPrivate_ +struct GArrowRecordBatchWriterPrivate { std::shared_ptr<arrow::ipc::RecordBatchWriter> record_batch_writer; -} GArrowRecordBatchWriterPrivate; + bool is_closed; +}; enum { - PROP_0, - PROP_RECORD_BATCH_WRITER + PROP_RECORD_BATCH_WRITER = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GArrowRecordBatchWriter, @@ -111,6 +111,7 @@ garrow_record_batch_writer_init(GArrowRecordBatchWriter *object) { auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(object); new (&priv->record_batch_writer) std::shared_ptr<arrow::ipc::RecordBatchWriter>; + priv->is_closed = false; } static void @@ -193,7 +194,27 @@ garrow_record_batch_writer_close(GArrowRecordBatchWriter *writer, GError **error auto arrow_writer = garrow_record_batch_writer_get_raw(writer); auto status = arrow_writer->Close(); - return garrow_error_check(error, status, "[record-batch-writer][close]"); + auto success = garrow_error_check(error, status, "[record-batch-writer][close]"); + if (success) { + auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + priv->is_closed = true; + } + return success; +} + +/** + * garrow_record_batch_writer_is_closed: + * @writer: A #GArrowRecordBatchWriter. + * + * Returns: %TRUE if the writer is closed, %FALSE otherwise.
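One detail worth noting: the flag only flips to %TRUE after Close() succeeds, so a failed close can be reported and retried. A minimal sketch of the intended guard, assuming an existing writer variable:

/* Close at most once; a failed Close() leaves is_closed unset. */
GError *error = NULL;
if (!garrow_record_batch_writer_is_closed(writer)) {
  if (!garrow_record_batch_writer_close(writer, &error)) {
    g_printerr("failed to close record batch writer: %s\n", error->message);
    g_clear_error(&error);
  }
}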
+ * + * Since: 18.0.0 + */ +gboolean +garrow_record_batch_writer_is_closed(GArrowRecordBatchWriter *writer) +{ + auto priv = GARROW_RECORD_BATCH_WRITER_GET_PRIVATE(writer); + return priv->is_closed; } G_DEFINE_TYPE(GArrowRecordBatchStreamWriter, diff --git a/c_glib/arrow-glib/writer.h b/c_glib/arrow-glib/writer.h index 46bbdddec8c9d..cea8390d9028f 100644 --- a/c_glib/arrow-glib/writer.h +++ b/c_glib/arrow-glib/writer.h @@ -53,6 +53,10 @@ GARROW_AVAILABLE_IN_ALL gboolean garrow_record_batch_writer_close(GArrowRecordBatchWriter *writer, GError **error); +GARROW_AVAILABLE_IN_18_0 +gboolean +garrow_record_batch_writer_is_closed(GArrowRecordBatchWriter *writer); + #define GARROW_TYPE_RECORD_BATCH_STREAM_WRITER \ (garrow_record_batch_stream_writer_get_type()) GARROW_AVAILABLE_IN_ALL diff --git a/c_glib/arrow-glib/writer.hpp b/c_glib/arrow-glib/writer.hpp index aa87ffe77d79b..1d85ac52f88d1 100644 --- a/c_glib/arrow-glib/writer.hpp +++ b/c_glib/arrow-glib/writer.hpp @@ -25,16 +25,20 @@ #include +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchWriter * garrow_record_batch_writer_new_raw( std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer); +GARROW_AVAILABLE_IN_ALL std::shared_ptr<arrow::ipc::RecordBatchWriter> garrow_record_batch_writer_get_raw(GArrowRecordBatchWriter *writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchStreamWriter * garrow_record_batch_stream_writer_new_raw( std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer); +GARROW_AVAILABLE_IN_ALL GArrowRecordBatchFileWriter * garrow_record_batch_file_writer_new_raw( std::shared_ptr<arrow::ipc::RecordBatchWriter> *arrow_writer); diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index b6f019ed27d46..7a672f1f21dcc 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -316,14 +316,13 @@ gparquet_writer_properties_get_data_page_size(GParquetWriterProperties *properti return parquet_properties->data_pagesize(); } -typedef struct GParquetArrowFileWriterPrivate_ +struct GParquetArrowFileWriterPrivate { parquet::arrow::FileWriter *arrow_file_writer; -} GParquetArrowFileWriterPrivate; +}; enum { - PROP_0, - PROP_ARROW_FILE_WRITER + PROP_ARROW_FILE_WRITER = 1, }; G_DEFINE_TYPE_WITH_PRIVATE(GParquetArrowFileWriter, @@ -496,6 +495,45 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, } } +/** + * gparquet_arrow_file_writer_get_schema: + * @writer: A #GParquetArrowFileWriter. + * + * Returns: (transfer full): The schema to be written. + * + * Since: 18.0.0 + */ +GArrowSchema * +gparquet_arrow_file_writer_get_schema(GParquetArrowFileWriter *writer) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_schema = parquet_arrow_file_writer->schema(); + return garrow_schema_new_raw(&arrow_schema); +} + +/** + * gparquet_arrow_file_writer_write_record_batch: + * @writer: A #GParquetArrowFileWriter. + * @record_batch: A record batch to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error.
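For reference, batch-at-a-time writing with the two new entry points looks roughly as follows. This is a sketch only: schema, record_batch, and the output path are hypothetical, and it assumes %NULL is accepted for the writer properties argument of gparquet_arrow_file_writer_new_path().

/* Hypothetical batch-at-a-time Parquet writing via the new API. */
GError *error = NULL;
GParquetArrowFileWriter *writer =
  gparquet_arrow_file_writer_new_path(schema, "/tmp/data.parquet", NULL, &error);
if (!writer) {
  /* report error ... */
}
/* The writer reports the schema it is going to write. */
GArrowSchema *written_schema = gparquet_arrow_file_writer_get_schema(writer);
g_object_unref(written_schema);
/* Append one record batch; the writer manages row groups itself. */
if (!gparquet_arrow_file_writer_write_record_batch(writer, record_batch, &error)) {
  /* report error ... */
}
gparquet_arrow_file_writer_close(writer, &error);
g_object_unref(writer);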
+ * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch).get(); + auto status = parquet_arrow_file_writer->WriteRecordBatch(*arrow_record_batch); + return garrow_error_check(error, + status, + "[parquet][arrow][file-writer][write-record-batch]"); +} + /** * gparquet_arrow_file_writer_write_table: * @writer: A #GParquetArrowFileWriter. @@ -510,13 +548,57 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error) { auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); auto arrow_table = garrow_table_get_raw(table).get(); - auto status = parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size); - return garrow_error_check(error, status, "[parquet][arrow][file-writer][write-table]"); + return garrow::check(error, + parquet_arrow_file_writer->WriteTable(*arrow_table, chunk_size), + "[parquet][arrow][file-writer][write-table]"); +} + +/** + * gparquet_arrow_file_writer_new_row_group: + * @writer: A #GParquetArrowFileWriter. + * @chunk_size: The max number of rows in a row group. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewRowGroup(chunk_size), + "[parquet][arrow][file-writer][new-row-group]"); +} + +/** + * gparquet_arrow_file_writer_write_chunked_array: + * @writer: A #GParquetArrowFileWriter. + * @chunked_array: A #GArrowChunkedArray to be written. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: %TRUE on success, %FALSE if there was an error. 
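The row-group functions pair up for column-at-a-time writing: each gparquet_arrow_file_writer_new_row_group() call starts a fresh row group, and each column of that group is then supplied as a chunked array, in schema order. A sketch under the same assumptions as above (one boolean column; chunked_array and last_chunk are hypothetical), mirroring the Ruby test later in the patch:

/* Hypothetical explicit row-group writing. */
GError *error = NULL;
/* First row group: up to two rows. */
gparquet_arrow_file_writer_new_row_group(writer, 2, &error);
gparquet_arrow_file_writer_write_chunked_array(writer, chunked_array, &error);
/* Second row group: one row. */
gparquet_arrow_file_writer_new_row_group(writer, 1, &error);
gparquet_arrow_file_writer_write_chunked_array(writer, last_chunk, &error);
gparquet_arrow_file_writer_close(writer, &error);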
+ * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + auto arrow_chunked_array = garrow_chunked_array_get_raw(chunked_array); + return garrow::check(error, + parquet_arrow_file_writer->WriteColumnChunk(arrow_chunked_array), + "[parquet][arrow][file-writer][write-chunked-array]"); } /** diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 71cbfa195e842..40595bdfef4b9 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -116,13 +116,35 @@ gparquet_arrow_file_writer_new_path(GArrowSchema *schema, GParquetWriterProperties *writer_properties, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +GArrowSchema * +gparquet_arrow_file_writer_get_schema(GParquetArrowFileWriter *writer); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_record_batch(GParquetArrowFileWriter *writer, + GArrowRecordBatch *record_batch, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, GArrowTable *table, - guint64 chunk_size, + gsize chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, + gsize chunk_size, + GError **error); + +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, + GArrowChunkedArray *chunked_array, + GError **error); + GPARQUET_AVAILABLE_IN_0_11 gboolean gparquet_arrow_file_writer_close(GParquetArrowFileWriter *writer, GError **error); diff --git a/c_glib/test/flight/test-client.rb b/c_glib/test/flight/test-client.rb index 7eb093d3cab80..f1e3f31234ab4 100644 --- a/c_glib/test/flight/test-client.rb +++ b/c_glib/test/flight/test-client.rb @@ -84,4 +84,37 @@ def test_error end end end + + sub_test_case("#do_put") do + def test_success + client = ArrowFlight::Client.new(@location) + generator = Helper::FlightInfoGenerator.new + descriptor = generator.page_view_descriptor + table = generator.page_view_table + result = client.do_put(descriptor, table.schema) + writer = result.writer + writer.write_table(table) + writer.done_writing + reader = result.reader + metadata = reader.read + writer.close + assert_equal(["done", table], + [metadata.data.to_s, @server.uploaded_table]) + end + + def test_error + client = ArrowFlight::Client.new(@location) + generator = Helper::FlightInfoGenerator.new + descriptor = generator.page_view_descriptor + table = generator.page_view_table + result = client.do_put(descriptor, table.schema) + assert_raise(Arrow::Error::Invalid) do + writer = result.writer + writer.done_writing + reader = result.reader + reader.read + writer.close + end + end + end end diff --git a/c_glib/test/helper/flight-server.rb b/c_glib/test/helper/flight-server.rb index 8c47029d41791..80b8a5c96cf9f 100644 --- a/c_glib/test/helper/flight-server.rb +++ b/c_glib/test/helper/flight-server.rb @@ -34,6 +34,8 @@ def virtual_do_is_valid(context, token) class FlightServer < ArrowFlight::Server type_register + attr_reader :uploaded_table + private def virtual_do_list_flights(context, criteria) generator = FlightInfoGenerator.new @@ -54,5 +56,14 @@ def virtual_do_do_get(context, ticket) reader = Arrow::TableBatchReader.new(table) ArrowFlight::RecordBatchStream.new(reader) end + + 
def virtual_do_do_put(context, reader, writer) + @uploaded_table = reader.read_all + writer.write(Arrow::Buffer.new("done")) + if @uploaded_table.n_rows.zero? + raise Arrow::Error::Invalid.new("empty table") + end + true + end end end diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index f899e7273b2a2..89db16c6fb90b 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -26,7 +26,39 @@ def setup end end - def test_write + def test_schema + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + assert_equal(schema, writer.schema) + writer.close + end + + def test_write_record_batch + enabled_values = [true, nil, false, true] + record_batch = + build_record_batch("enabled" => build_boolean_array(enabled_values)) + + writer = Parquet::ArrowFileWriter.new(record_batch.schema, @file.path) + writer.write_record_batch(record_batch) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 1, + Arrow::Table.new(record_batch.schema, [record_batch]), + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end + + def test_write_table enabled_values = [true, nil, false, true] table = build_table("enabled" => build_boolean_array(enabled_values)) chunk_size = 2 @@ -40,11 +72,41 @@ def test_write reader.use_threads = true assert_equal([ enabled_values.length / chunk_size, - true, + table, + ], + [ + reader.n_row_groups, + reader.read_table, + ]) + ensure + reader.unref + end + end + + def test_write_chunked_array + schema = build_schema("enabled" => :boolean) + writer = Parquet::ArrowFileWriter.new(schema, @file.path) + writer.new_row_group(2) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([true, nil])]) + writer.write_chunked_array(chunked_array) + writer.new_row_group(1) + chunked_array = Arrow::ChunkedArray.new([build_boolean_array([false])]) + writer.write_chunked_array(chunked_array) + writer.close + + reader = Parquet::ArrowFileReader.new(@file.path) + begin + reader.use_threads = true + assert_equal([ + 2, + build_table("enabled" => [ + build_boolean_array([true, nil]), + build_boolean_array([false]), + ]), ], [ reader.n_row_groups, - table.equal_metadata(reader.read_table, false), + reader.read_table, ]) ensure reader.unref diff --git a/c_glib/test/test-file-writer.rb b/c_glib/test/test-file-writer.rb index 5f9c3c4e19aa9..06c9dfa25c7fc 100644 --- a/c_glib/test/test-file-writer.rb +++ b/c_glib/test/test-file-writer.rb @@ -34,6 +34,9 @@ def test_write_record_batch file_writer.write_record_batch(record_batch) ensure file_writer.close + assert do + file_writer.closed? + end end ensure output.close @@ -68,6 +71,9 @@ def test_write_table file_writer.write_table(table) ensure file_writer.close + assert do + file_writer.closed? + end end ensure output.close diff --git a/c_glib/test/test-stream-writer.rb b/c_glib/test/test-stream-writer.rb index 32754e20838b4..261732ae91e15 100644 --- a/c_glib/test/test-stream-writer.rb +++ b/c_glib/test/test-stream-writer.rb @@ -35,6 +35,9 @@ def test_write_record_batch stream_writer.write_record_batch(record_batch) ensure stream_writer.close + assert do + stream_writer.closed? 
+ end end ensure output.close diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index f688fbb63a9ad..08a052e82f24d 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON set ARROW_CXXFLAGS=/WX /MP @rem Install GCS testbench +set PIPX_BIN_DIR=C:\Windows\ call %CD%\ci\scripts\install_gcs_testbench.bat +storage-testbench -h || exit /B @rem @rem Build and test Arrow C++ libraries (including Parquet) diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index dff1f2224809a..f0084894e19dc 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -42,17 +42,19 @@ RUN mamba install -q -y \ valgrind && \ mamba clean --all +# We want to install the GCS testbench using the Conda base environment's Python, +# because the test environment's Python may later change. +ENV PIPX_BASE_PYTHON=/opt/conda/bin/python3 +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + # Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to -# be on the path for the tests to run. +# be on the path for the tests to run. ENV PATH=/opt/conda/envs/arrow/bin:$PATH COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh -# We want to install the GCS testbench using the same Python binary that the Conda code will use. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index c602490d6b729..7ad2e5c0e8008 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -24,7 +24,7 @@ ARG maven=3.8.7 ARG node=16 ARG yarn=1.22 ARG jdk=11 -ARG go=1.21.8 +ARG go=1.22.6 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 027fd589cecca..7e8dbe76f6248 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -32,11 +32,6 @@ RUN mamba install -q -y \ nomkl && \ mamba clean --all -# XXX The GCS testbench was already installed in conda-cpp.dockerfile, -# but we changed the installed Python version above, so we need to reinstall it. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - ENV ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/docker/debian-12-go.dockerfile b/ci/docker/debian-12-go.dockerfile index c958e6bdee211..4bc683c109eb8 100644 --- a/ci/docker/debian-12-go.dockerfile +++ b/ci/docker/debian-12-go.dockerfile @@ -16,8 +16,8 @@ # under the License. 
ARG arch=amd64 -ARG go=1.21 -ARG staticcheck=v0.4.7 +ARG go=1.22 +ARG staticcheck=v0.5.1 FROM ${arch}/golang:${go}-bookworm # FROM collects all the args, get back the staticcheck version arg diff --git a/ci/docker/fedora-39-cpp.dockerfile b/ci/docker/fedora-39-cpp.dockerfile index 33d11823094ce..2ac5afe7b91f6 100644 --- a/ci/docker/fedora-39-cpp.dockerfile +++ b/ci/docker/fedora-39-cpp.dockerfile @@ -34,6 +34,7 @@ RUN dnf update -y && \ curl-devel \ gcc \ gcc-c++ \ + gdb \ gflags-devel \ git \ glog-devel \ diff --git a/ci/docker/linux-apt-python-313-freethreading.dockerfile b/ci/docker/linux-apt-python-313-freethreading.dockerfile new file mode 100644 index 0000000000000..f5505e67f00bb --- /dev/null +++ b/ci/docker/linux-apt-python-313-freethreading.dockerfile @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +ARG base +FROM ${base} + +RUN apt-get update -y -q && \ + apt install -y -q --no-install-recommends software-properties-common gpg-agent && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update -y -q && \ + apt install -y -q --no-install-recommends python3.13-dev python3.13-nogil python3.13-venv && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + +COPY python/requirements-build.txt \ + python/requirements-test.txt \ + /arrow/python/ + +ENV ARROW_PYTHON_VENV /arrow-dev +RUN python3.13t -m venv ${ARROW_PYTHON_VENV} +RUN ${ARROW_PYTHON_VENV}/bin/python -m pip install -U pip setuptools wheel +RUN ${ARROW_PYTHON_VENV}/bin/python -m pip install \ + --pre \ + --prefer-binary \ + --extra-index-url "https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" \ + -r arrow/python/requirements-build.txt \ + -r arrow/python/requirements-test.txt + +# We want to run the PyArrow test suite with the GIL disabled, but cffi +# (more precisely, the `_cffi_backend` module) currently doesn't declare +# itself safe to run without the GIL. +# Therefore set PYTHON_GIL to 0. 
+ENV ARROW_ACERO=ON \ + ARROW_BUILD_STATIC=OFF \ + ARROW_BUILD_TESTS=OFF \ + ARROW_BUILD_UTILITIES=OFF \ + ARROW_COMPUTE=ON \ + ARROW_CSV=ON \ + ARROW_DATASET=ON \ + ARROW_FILESYSTEM=ON \ + ARROW_GDB=ON \ + ARROW_HDFS=ON \ + ARROW_JSON=ON \ + ARROW_USE_GLOG=OFF \ + PYTHON_GIL=0 diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index 443ff9c53cbcb..09883f9780a36 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -19,13 +19,19 @@ ARG arch ARG python_image_tag FROM ${arch}/python:${python_image_tag} -# RUN pip install --upgrade pip - # pandas doesn't provide wheel for aarch64 yet, so cache the compiled # test dependencies in a docker image COPY python/requirements-wheel-test.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-test.txt +# Install the GCS testbench with the system Python +RUN apt-get update -y -q && \ + apt-get install -y -q \ + build-essential \ + python3-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists* + COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ -ARG python -RUN PYTHON_VERSION=${python} /arrow/ci/scripts/install_gcs_testbench.sh default +ENV PIPX_PYTHON=/usr/bin/python3 PIPX_PIP_ARGS=--prefer-binary +RUN /arrow/ci/scripts/install_gcs_testbench.sh default diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 42f088fd8a22a..5cc1711608c03 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -100,6 +100,9 @@ RUN vcpkg install \ --x-feature=parquet \ --x-feature=s3 +# Make sure auditwheel is up-to-date +RUN pipx upgrade auditwheel + # Configure Python for applications running in the bash shell of this Dockerfile ARG python=3.8 ENV PYTHON_VERSION=${python} diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 5f488a4c285ff..bffc1bd13d6b7 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -27,24 +27,38 @@ FROM abrarov/msvc-2019:2.11.0 # Add unix tools to path RUN setx path "%path%;C:\Program Files\Git\usr\bin" -# Remove previous installations of python from the base image +# 1. Remove previous installations of python from the base image # NOTE: a more recent base image (tried with 2.12.1) comes with python 3.9.7 # and the msi installers are failing to remove pip and tcl/tk "products" making # the subsequent choco python installation step failing for installing python # version 3.9.* due to existing python version +# 2. Install Minio for S3 testing. RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ - rm -rf Python* + rm -rf Python* && \ + curl https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z \ + --output "C:\Windows\Minio.exe" + +# Install the GCS testbench using a well-known Python version. +# NOTE: cannot use pipx's `--fetch-missing-python` because of +# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. 
+RUN choco install -r -y --pre --no-progress python --version=3.11.9 +ENV PIPX_BIN_DIR=C:\\Windows\\ +ENV PIPX_PYTHON="C:\Python311\python.exe" +COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ +RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \ + storage-testbench -h # Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0) ARG python=3.8 -RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10") & \ + (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1") # Install archiver to extract xz archives -RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \ - python -m pip install --no-cache-dir -U pip setuptools & \ +RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% && \ choco install --no-progress -r -y archiver + +ENV PYTHON=$python diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index e17c0306f115d..1b342df596c9d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -29,10 +29,12 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 341d8a87e8661..ce31c457e909e 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -29,10 +29,12 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index a995ab2a8bc2d..a1fd178a2c754 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -29,10 +29,12 @@ RUN apt-get update -y -q && \ ccache \ cmake \ curl \ + gdb \ git \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ tzdata-legacy \ wget && \ diff --git a/ci/scripts/install_gcs_testbench.bat b/ci/scripts/install_gcs_testbench.bat index b03d0c2ad6608..f54f98db7cac8 100644 --- a/ci/scripts/install_gcs_testbench.bat +++ b/ci/scripts/install_gcs_testbench.bat @@ -17,9 +17,18 @@ @echo on -set GCS_TESTBENCH_VERSION="v0.36.0" +set GCS_TESTBENCH_VERSION="v0.40.0" + +set PIPX_FLAGS=--verbose +if NOT "%PIPX_PYTHON%"=="" ( + set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS% 
+) + +python -m pip install -U pipx || exit /B 1 @REM Install GCS testbench %GCS_TESTBENCH_VERSION% -python -m pip install ^ +pipx install %PIPX_FLAGS% ^ "https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz" ^ || exit /B 1 + +pipx list --verbose diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 5471b3cc238ca..48a5858a358c9 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -e +set -ex if [ "$#" -ne 1 ]; then echo "Usage: $0 " @@ -34,19 +34,26 @@ case "$(uname -m)" in ;; esac -# On newer pythons install into the system will fail, so override that -export PIP_BREAK_SYSTEM_PACKAGES=1 - version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" - # Latests versions of Testbench require newer setuptools - python3 -m pip install --upgrade setuptools fi -# This script is run with PYTHON undefined in some places, -# but those only use older pythons. -if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then - python3 -m pip install \ - "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" +# The Python to install pipx with +: ${PIPX_BASE_PYTHON:=$(which python3)} +# The Python to install the GCS testbench with +: ${PIPX_PYTHON:=${PIPX_BASE_PYTHON:-$(which python3)}} + +export PIP_BREAK_SYSTEM_PACKAGES=1 +${PIPX_BASE_PYTHON} -m pip install -U pipx + +pipx_flags=(--verbose --python ${PIPX_PYTHON}) +if [[ $(id -un) == "root" ]]; then + # Install globally as /root/.local/bin is typically not in $PATH + pipx_flags+=(--global) +fi +if [[ -n "${PIPX_PIP_ARGS}" ]]; then + pipx_flags+=(--pip-args "'${PIPX_PIP_ARGS}'") fi +${PIPX_BASE_PYTHON} -m pipx install ${pipx_flags[@]} \ + "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index 2eb58e8dc75ec..3050ad3111198 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -ex +set -e arrow_dir=${1} build_dir=${2} @@ -30,8 +30,13 @@ gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration : ${ARROW_INTEGRATION_JAVA:=ON} : ${ARROW_INTEGRATION_JS:=ON} +. ${arrow_dir}/ci/scripts/util_log.sh + +github_actions_group_begin "Integration: Prepare: Archery" pip install -e $arrow_dir/dev/archery[integration] +github_actions_group_end +github_actions_group_begin "Integration: Prepare: Dependencies" # For C Data Interface testing if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then pip install pythonnet @@ -39,6 +44,7 @@ fi if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +github_actions_group_end export ARROW_BUILD_ROOT=${build_dir} diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index 9b54049a2b803..8fca0d434b75e 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -ex +set -e arrow_dir=${1} build_dir=${2} @@ -28,22 +28,35 @@ build_dir=${2} : ${ARROW_INTEGRATION_JAVA:=ON} : ${ARROW_INTEGRATION_JS:=ON} +. 
${arrow_dir}/ci/scripts/util_log.sh + +github_actions_group_begin "Integration: Build: Rust" ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +github_actions_group_end +github_actions_group_begin "Integration: Build: nanoarrow" ${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} +github_actions_group_end +github_actions_group_begin "Integration: Build: C++" if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end +github_actions_group_begin "Integration: Build: C#" if [ "${ARROW_INTEGRATION_CSHARP}" == "ON" ]; then ${arrow_dir}/ci/scripts/csharp_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end +github_actions_group_begin "Integration: Build: Go" if [ "${ARROW_INTEGRATION_GO}" == "ON" ]; then ${arrow_dir}/ci/scripts/go_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end +github_actions_group_begin "Integration: Build: Java" if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then export ARROW_JAVA_CDATA="ON" export JAVA_JNI_CMAKE_ARGS="-DARROW_JAVA_JNI_ENABLE_DEFAULT=OFF -DARROW_JAVA_JNI_ENABLE_C=ON" @@ -51,7 +64,10 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then ${arrow_dir}/ci/scripts/java_jni_build.sh ${arrow_dir} ${ARROW_HOME} ${build_dir} /tmp/dist/java ${arrow_dir}/ci/scripts/java_build.sh ${arrow_dir} ${build_dir} /tmp/dist/java fi +github_actions_group_end +github_actions_group_begin "Integration: Build: JavaScript" if [ "${ARROW_INTEGRATION_JS}" == "ON" ]; then ${arrow_dir}/ci/scripts/js_build.sh ${arrow_dir} ${build_dir} fi +github_actions_group_end diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index d5430f26748eb..d2c392e6b9db3 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -34,7 +34,7 @@ rm -rf ${source_dir}/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Set SDK, C++ and Wheel flags ===" export _PYTHON_HOST_PLATFORM="macosx-${MACOSX_DEPLOYMENT_TARGET}-${arch}" -export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.15} +export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-12.0} export SDKROOT=${SDKROOT:-$(xcrun --sdk macosx --show-sdk-path)} if [ $arch = "arm64" ]; then @@ -150,7 +150,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index aa86494a9d47d..885019ff3049f 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -140,7 +140,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} @@ -181,5 +180,5 @@ popd rm -rf dist/temp-fix-wheel echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" -auditwheel repair -L . 
dist/pyarrow-*.whl -w repaired_wheels +auditwheel repair dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index cf87a17056783..6bdc3d3621e14 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -34,6 +34,7 @@ source_dir=${1} : ${ARROW_S3:=ON} : ${ARROW_SUBSTRAIT:=ON} : ${CHECK_IMPORTS:=ON} +: ${CHECK_WHEEL_CONTENT:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} @@ -87,6 +88,11 @@ import pyarrow.parquet fi fi +if [ "${CHECK_WHEEL_CONTENT}" == "ON" ]; then + python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \ + --path ${source_dir}/python/repaired_wheels +fi + if [ "${CHECK_UNITTESTS}" == "ON" ]; then # Install testing dependencies pip install -U -r ${source_dir}/python/requirements-wheel-test.txt diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py new file mode 100644 index 0000000000000..22b3a890f036b --- /dev/null +++ b/ci/scripts/python_wheel_validate_contents.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +from pathlib import Path +import re +import zipfile + + +def validate_wheel(path): + p = Path(path) + wheels = list(p.glob('*.whl')) + error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" + assert len(wheels) == 1, error_msg + f = zipfile.ZipFile(wheels[0]) + outliers = [ + info.filename for info in f.filelist if not re.match( + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/)', info.filename + ) + ] + assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + print(f"The wheel: {wheels[0]} seems valid.") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--path", type=str, required=True, + help="Directory where wheel is located") + args = parser.parse_args() + validate_wheel(args.path) + + +if __name__ == '__main__': + main() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 54f02ec6f6ed0..1f1d5dca721d9 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -106,7 +106,6 @@ echo "=== (%PYTHON_VERSION%) Building wheel ===" set PYARROW_BUILD_TYPE=%CMAKE_BUILD_TYPE% set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% -set PYARROW_INSTALL_TESTS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index 87c0bb1252024..ae5b7e36ad7ab 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -28,7 +28,7 @@ set PYARROW_TEST_ORC=ON set PYARROW_TEST_PARQUET=ON set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON -set PYARROW_TEST_S3=OFF +set PYARROW_TEST_S3=ON set PYARROW_TEST_TENSORFLOW=ON @REM Enable again once https://github.com/scipy/oldest-supported-numpy/pull/27 gets merged @@ -37,28 +37,35 @@ set PYARROW_TEST_TENSORFLOW=ON set ARROW_TEST_DATA=C:\arrow\testing\data set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data -@REM Install testing dependencies -pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 +@REM List installed Pythons +py -0p + +set PYTHON_CMD=py -%PYTHON% -@REM Install GCS testbench -call "C:\arrow\ci\scripts\install_gcs_testbench.bat" +%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1 + +@REM Install testing dependencies +%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 @REM Install the built wheels -python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 +%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 @REM Test that the modules are importable -python -c "import pyarrow" || exit /B 1 -python -c "import pyarrow._gcsfs" || exit /B 1 -python -c "import pyarrow._hdfs" || exit /B 1 -python -c "import pyarrow._s3fs" || exit /B 1 -python -c "import pyarrow.csv" || exit /B 1 -python -c "import pyarrow.dataset" || exit /B 1 -python -c "import pyarrow.flight" || exit /B 1 -python -c "import pyarrow.fs" || exit /B 1 -python -c "import pyarrow.json" || exit /B 1 -python -c "import pyarrow.orc" || exit /B 1 -python -c "import pyarrow.parquet" || exit /B 1 -python -c "import pyarrow.substrait" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._s3fs" || exit 
/B 1 +%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 + +@REM Validate wheel contents +%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\dist || exit /B 1 @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B @@ -67,4 +74,4 @@ arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo @REM Execute unittest -pytest -r s --pyargs pyarrow || exit /B 1 +%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/ci/scripts/util_enable_core_dumps.sh b/ci/scripts/util_enable_core_dumps.sh new file mode 100644 index 0000000000000..09f8d2d727099 --- /dev/null +++ b/ci/scripts/util_enable_core_dumps.sh @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# NOTE: this script is not marked executable as it should be source'd +# for `ulimit` to take effect. + +set -e + +platform=$(uname) + +if [ "${platform}" = "Linux" ]; then + # We need to override `core_pattern` because + # 1. the original setting may reference apport, which is not available under + # most Docker containers; + # 2. we want to write the core file in a well-known directory. + sudo sysctl -w kernel.core_pattern="/tmp/core.%e.%p" +fi + +ulimit -c unlimited diff --git a/ci/scripts/util_log.sh b/ci/scripts/util_log.sh new file mode 100644 index 0000000000000..b34c44059adb2 --- /dev/null +++ b/ci/scripts/util_log.sh @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +github_actions_group_begin() { + echo "::group::$1" + set -x +} + +github_actions_group_end() { + set +x + echo "::endgroup::" +} diff --git a/ci/vcpkg/arm64-osx-static-debug.cmake b/ci/vcpkg/arm64-osx-static-debug.cmake index f511819a2edd9..32ae7bc433489 100644 --- a/ci/vcpkg/arm64-osx-static-debug.cmake +++ b/ci/vcpkg/arm64-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/arm64-osx-static-release.cmake b/ci/vcpkg/arm64-osx-static-release.cmake index 43d65efb2651b..dde46cd763afe 100644 --- a/ci/vcpkg/arm64-osx-static-release.cmake +++ b/ci/vcpkg/arm64-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake index 8abc1ebf838f1..d3ef0d67eb719 100644 --- a/ci/vcpkg/universal2-osx-static-debug.cmake +++ b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake index 2eb36c15175b2..3018aa93e5fbb 100644 --- a/ci/vcpkg/universal2-osx-static-release.cmake +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5ead9e4b063cd..423744c388471 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -301,7 +301,8 @@ add_custom_target(lint --cpplint_binary ${CPPLINT_BIN} ${COMMON_LINT_OPTIONS} - ${ARROW_LINT_QUIET}) + ${ARROW_LINT_QUIET} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) # # "make format" and "make check-format" targets diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 8e42438a23c1c..55e3fe0980749 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -121,12 +121,15 @@ function print_coredumps() { # patterns must be set with prefix `core.{test-executable}*`: # # In case of macOS: - # sudo sysctl -w kern.corefile=core.%N.%P + # sudo sysctl -w kern.corefile=/tmp/core.%N.%P # On Linux: - # sudo sysctl -w kernel.core_pattern=core.%e.%p + # sudo sysctl -w kernel.core_pattern=/tmp/core.%e.%p # # and the ulimit must be increased: # ulimit -c unlimited + # + # If the tests are run in a Docker container, the instructions are slightly + # different: see the 'Coredumps' comment section in `docker-compose.yml`. 
# filename is truncated to the first 15 characters in case of linux, so limit # the pattern for the first 15 characters @@ -134,19 +137,21 @@ function print_coredumps() { FILENAME=$(echo ${FILENAME} | cut -c-15) PATTERN="^core\.${FILENAME}" - COREFILES=$(ls | grep $PATTERN) + COREFILES=$(ls /tmp | grep $PATTERN) if [ -n "$COREFILES" ]; then - echo "Found core dump, printing backtrace:" - for COREFILE in $COREFILES; do + COREPATH="/tmp/${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + echo "Running '${TEST_EXECUTABLE}' produced core dump at '${COREPATH}', printing backtrace:" # Print backtrace if [ "$(uname)" == "Darwin" ]; then - lldb -c "${COREFILE}" --batch --one-line "thread backtrace all -e true" + lldb -c "${COREPATH}" --batch --one-line "thread backtrace all -e true" else - gdb -c "${COREFILE}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch + gdb -c "${COREPATH}" $TEST_EXECUTABLE -ex "thread apply all bt" -ex "set pagination 0" -batch fi - # Remove the coredump, regenerate it via running the test case directly - rm "${COREFILE}" + echo "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + # Remove the coredump, it can be regenerated via running the test case directly + rm "${COREPATH}" done fi } diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py index 76c0fe0aefaca..a81acf2eb2ff9 100755 --- a/cpp/build-support/run_cpplint.py +++ b/cpp/build-support/run_cpplint.py @@ -26,24 +26,6 @@ from functools import partial -# NOTE(wesm): -# -# * readability/casting is disabled as it aggressively warns about functions -# with names like "int32", so "int32(x)", where int32 is a function name, -# warns with -_filters = ''' --whitespace/comments --readability/casting --readability/todo --readability/alt_tokens --build/header_guard --build/c++11 --build/include_what_you_use --runtime/references --build/include_order -'''.split() - - def _get_chunk_key(filenames): # lists are not hashable so key on the first filename in a chunk return filenames[0] @@ -87,8 +69,6 @@ def _check_some_files(completed_processes, filenames): cmd = [ arguments.cpplint_binary, '--verbose=2', - '--linelength=90', - '--filter=' + ','.join(_filters) ] if (arguments.cpplint_binary.endswith('.py') and platform.system() == 'Windows'): diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index 41466a1c22404..755887314d110 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ -303,7 +303,10 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_IPC) define_option(ARROW_AZURE - "Build Arrow with Azure support (requires the Azure SDK for C++)" OFF) + "Build Arrow with Azure support (requires the Azure SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_BUILD_UTILITIES "Build Arrow commandline utilities" OFF) @@ -346,9 +349,16 @@ takes precedence over ccache if a storage backend is configured" ON) ARROW_WITH_UTF8PROC) define_option(ARROW_GCS - "Build Arrow with GCS support (requires the GCloud SDK for C++)" OFF) + "Build Arrow with GCS support (requires the GCloud SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) - define_option(ARROW_HDFS "Build the Arrow HDFS bridge" OFF) + define_option(ARROW_HDFS + "Build the Arrow HDFS bridge" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_IPC "Build the Arrow IPC extensions" ON) @@ -398,7 +408,11 @@ takes precedence over ccache if a 
storage backend is configured" ON) ARROW_HDFS ARROW_JSON) - define_option(ARROW_S3 "Build Arrow with S3 support (requires the AWS SDK for C++)" OFF) + define_option(ARROW_S3 + "Build Arrow with S3 support (requires the AWS SDK for C++)" + OFF + DEPENDS + ARROW_FILESYSTEM) define_option(ARROW_SKYHOOK "Build the Skyhook libraries" diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 63e2c036c9a6f..b31037a973279 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -259,7 +259,7 @@ macro(resolve_dependency DEPENDENCY_NAME) IS_RUNTIME_DEPENDENCY REQUIRED_VERSION USE_CONFIG) - set(multi_value_args COMPONENTS PC_PACKAGE_NAMES) + set(multi_value_args COMPONENTS OPTIONAL_COMPONENTS PC_PACKAGE_NAMES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -287,6 +287,9 @@ macro(resolve_dependency DEPENDENCY_NAME) if(ARG_COMPONENTS) list(APPEND FIND_PACKAGE_ARGUMENTS COMPONENTS ${ARG_COMPONENTS}) endif() + if(ARG_OPTIONAL_COMPONENTS) + list(APPEND FIND_PACKAGE_ARGUMENTS OPTIONAL_COMPONENTS ${ARG_OPTIONAL_COMPONENTS}) + endif() if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") find_package(${FIND_PACKAGE_ARGUMENTS}) set(COMPATIBLE ${${PACKAGE_NAME}_FOUND}) @@ -1289,15 +1292,19 @@ if(ARROW_USE_BOOST) set(Boost_USE_STATIC_LIBS ON) endif() if(ARROW_BOOST_REQUIRE_LIBRARY) - set(ARROW_BOOST_COMPONENTS system filesystem) + set(ARROW_BOOST_COMPONENTS filesystem system) + set(ARROW_BOOST_OPTIONAL_COMPONENTS process) else() set(ARROW_BOOST_COMPONENTS) + set(ARROW_BOOST_OPTIONAL_COMPONENTS) endif() resolve_dependency(Boost REQUIRED_VERSION ${ARROW_BOOST_REQUIRED_VERSION} COMPONENTS ${ARROW_BOOST_COMPONENTS} + OPTIONAL_COMPONENTS + ${ARROW_BOOST_OPTIONAL_COMPONENTS} IS_RUNTIME_DEPENDENCY # libarrow.so doesn't depend on libboost*. FALSE) @@ -1316,14 +1323,35 @@ if(ARROW_USE_BOOST) endif() endforeach() - if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # boost/process/detail/windows/handle_workaround.hpp doesn't work - # without BOOST_USE_WINDOWS_H with MinGW because MinGW doesn't - # provide __kernel_entry without winternl.h. - # - # See also: - # https://github.com/boostorg/process/blob/develop/include/boost/process/detail/windows/handle_workaround.hpp - target_compile_definitions(Boost::headers INTERFACE "BOOST_USE_WINDOWS_H=1") + if(TARGET Boost::process) + # Boost >= 1.86 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V1") + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + else() + # Boost < 1.86 + add_library(Boost::process INTERFACE IMPORTED) + if(TARGET Boost::filesystem) + target_link_libraries(Boost::process INTERFACE Boost::filesystem) + endif() + if(TARGET Boost::system) + target_link_libraries(Boost::process INTERFACE Boost::system) + endif() + if(TARGET Boost::headers) + target_link_libraries(Boost::process INTERFACE Boost::headers) + endif() + if(Boost_VERSION VERSION_GREATER_EQUAL 1.80) + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + # Boost < 1.86 has a bug that + # boost::process::v2::process_environment::on_setup() isn't + # defined. We need to build Boost Process source to define it. 
+ # + # See also: + # https://github.com/boostorg/process/issues/312 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_NEED_SOURCE") + if(WIN32) + target_link_libraries(Boost::process INTERFACE bcrypt ntdll) + endif() + endif() endif() message(STATUS "Boost include dir: ${Boost_INCLUDE_DIRS}") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416ede..e77a02d0c0800 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -373,8 +373,11 @@ set(ARROW_SRCS config.cc datum.cc device.cc + device_allocation_type_set.cc extension_type.cc extension/bool8.cc + extension/json.cc + extension/uuid.cc pretty_print.cc record_batch.cc result.cc @@ -642,9 +645,13 @@ else() endif() set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) -set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) -set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static - ${ARROW_GTEST_GTEST}) +set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON Boost::process) +set(ARROW_TESTING_STATIC_LINK_LIBS + arrow::flatbuffers + RapidJSON + Boost::process + arrow_static + ${ARROW_GTEST_GTEST}) set(ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) set(ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) # that depend on gtest @@ -665,9 +672,10 @@ set(ARROW_TESTING_SRCS io/test_common.cc ipc/test_common.cc testing/fixed_width_test_util.cc + testing/generator.cc testing/gtest_util.cc + testing/process.cc testing/random.cc - testing/generator.cc testing/util.cc) # @@ -1225,6 +1233,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1267,7 +1276,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 854862e3e48ca..c0dfba66336af 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -165,11 +165,11 @@ struct SumSentinelUnrolled : public Summer<T> { static void Sum(const ArrayType& array, SumState<T>* state) { SumState<T> local; -#define SUM_NOT_NULL(ITEM) \ - do { \ - local.total += values[i + ITEM] * Traits<T>::NotNull(values[i + ITEM]); \ - local.valid_count++; \ - } while (0) +# define SUM_NOT_NULL(ITEM) \ + do { \ + local.total += values[i + ITEM] * Traits<T>::NotNull(values[i + ITEM]); \ + local.valid_count++; \ + } while (0) const auto values = array.raw_values(); const auto length = array.length(); @@ -185,7 +185,7 @@ struct SumSentinelUnrolled : public Summer<T> { SUM_NOT_NULL(7); } -#undef SUM_NOT_NULL +# undef SUM_NOT_NULL for (int64_t i = length_rounded * 8; i < length; ++i) { local.total += values[i] * Traits<T>::NotNull(values[i]); @@ -256,7 +256,7 @@ struct SumBitmapVectorizeUnroll : public Summer<T> { for (int64_t i = 0; i < length_rounded; i += 8) { const uint8_t valid_byte = bitmap[i / 8]; -#define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1)) +# define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1)) if (valid_byte < 0xFF) { // Some nulls @@ -277,7 +277,7 @@ struct SumBitmapVectorizeUnroll : public Summer<T> { } } -#undef SUM_SHIFT +# undef SUM_SHIFT for (int64_t i = length_rounded; i < length; ++i) { if (bit_util::GetBit(bitmap, i)) { diff --git a/cpp/src/arrow/acero/aggregate_node_test.cc
b/cpp/src/arrow/acero/aggregate_node_test.cc index d398fb24b73d5..c623271db9fb4 100644 --- a/cpp/src/arrow/acero/aggregate_node_test.cc +++ b/cpp/src/arrow/acero/aggregate_node_test.cc @@ -210,5 +210,57 @@ TEST(GroupByNode, NoSkipNulls) { AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch}, out_batches.batches); } +TEST(ScalarAggregateNode, AnyAll) { + // GH-43768: boolean_any and boolean_all with constant input should work well + // when min_count != 0. + std::shared_ptr<Schema> in_schema = schema({field("not_used", int32())}); + std::shared_ptr<Schema> out_schema = schema({field("agg_out", boolean())}); + struct AnyAllCase { + std::string batches_json; + Expression literal; + std::string expected_json; + bool skip_nulls = false; + uint32_t min_count = 2; + }; + std::vector<AnyAllCase> cases{ + {"[[42], [42], [42], [42]]", literal(true), "[[true]]"}, + {"[[42], [42], [42], [42]]", literal(false), "[[false]]"}, + {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]"}, + {"[[42]]", literal(true), "[[null]]"}, + {"[[42], [42], [42]]", literal(true), "[[true]]"}, + {"[[42], [42], [42]]", literal(true), "[[null]]", /*skip_nulls=*/false, + /*min_count=*/4}, + {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]", + /*skip_nulls=*/true}, + }; + for (const AnyAllCase& any_all_case : cases) { + for (auto func_name : {"any", "all"}) { + std::vector<ExecBatch> batches{ + ExecBatchFromJSON({int32()}, any_all_case.batches_json)}; + std::vector<Aggregate> aggregates = { + Aggregate(func_name, + std::make_shared<ScalarAggregateOptions>( + /*skip_nulls=*/any_all_case.skip_nulls, + /*min_count=*/any_all_case.min_count), + FieldRef("literal"))}; + + // And a projection to make the input include a Scalar Boolean + Declaration plan = Declaration::Sequence( + {{"exec_batch_source", ExecBatchSourceNodeOptions(in_schema, batches)}, + {"project", ProjectNodeOptions({any_all_case.literal}, {"literal"})}, + {"aggregate", AggregateNodeOptions(aggregates)}}); + + ASSERT_OK_AND_ASSIGN(BatchesWithCommonSchema out_batches, + DeclarationToExecBatches(plan)); + + ExecBatch expected_batch = + ExecBatchFromJSON({boolean()}, any_all_case.expected_json); + + AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch}, + out_batches.batches); + } + } +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 2248362241cd7..c4f11d01f3d5c 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -34,7 +34,7 @@ #include "arrow/acero/options.h" #include "arrow/acero/unmaterialized_table_internal.h" #ifndef NDEBUG -#include "arrow/acero/options_internal.h" +# include "arrow/acero/options_internal.h" #endif #include "arrow/acero/query_context.h" #include "arrow/acero/schema_util.h" @@ -42,7 +42,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #ifndef NDEBUG -#include "arrow/compute/function_internal.h" +# include "arrow/compute/function_internal.h" #endif #include "arrow/acero/time_series_util.h" #include "arrow/compute/key_hash_internal.h" @@ -207,16 +207,16 @@ class DebugSync { std::unique_lock<std::mutex> debug_lock_; }; -#define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__) -#define DEBUG_MANIP(manip) \ - DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; }) -#define NDEBUG_EXPLICIT -#define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__ +# define DEBUG_SYNC(node, ...)
diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc
index 2248362241cd7..c4f11d01f3d5c 100644
--- a/cpp/src/arrow/acero/asof_join_node.cc
+++ b/cpp/src/arrow/acero/asof_join_node.cc
@@ -34,7 +34,7 @@
 #include "arrow/acero/options.h"
 #include "arrow/acero/unmaterialized_table_internal.h"
 #ifndef NDEBUG
-#include "arrow/acero/options_internal.h"
+#  include "arrow/acero/options_internal.h"
 #endif
 #include "arrow/acero/query_context.h"
 #include "arrow/acero/schema_util.h"
@@ -42,7 +42,7 @@
 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_primitive.h"
 #ifndef NDEBUG
-#include "arrow/compute/function_internal.h"
+#  include "arrow/compute/function_internal.h"
 #endif
 #include "arrow/acero/time_series_util.h"
 #include "arrow/compute/key_hash_internal.h"
@@ -207,16 +207,16 @@ class DebugSync {
   std::unique_lock<std::mutex> debug_lock_;
 };

-#define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__)
-#define DEBUG_MANIP(manip) \
-  DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; })
-#define NDEBUG_EXPLICIT
-#define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__
+#  define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__)
+#  define DEBUG_MANIP(manip) \
+    DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; })
+#  define NDEBUG_EXPLICIT
+#  define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__
 #else
-#define DEBUG_SYNC(...)
-#define DEBUG_MANIP(...)
-#define NDEBUG_EXPLICIT explicit
-#define DEBUG_ADD(ndebug, ...) ndebug
+#  define DEBUG_SYNC(...)
+#  define DEBUG_MANIP(...)
+#  define NDEBUG_EXPLICIT explicit
+#  define DEBUG_ADD(ndebug, ...) ndebug
 #endif

 struct MemoStore {
diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc
index 555f580028fac..5d3e9fba08bbf 100644
--- a/cpp/src/arrow/acero/asof_join_node_test.cc
+++ b/cpp/src/arrow/acero/asof_join_node_test.cc
@@ -26,13 +26,13 @@
 #include "arrow/acero/exec_plan.h"
 #include "arrow/testing/future_util.h"
 #ifndef NDEBUG
-#include
+#  include
 #endif
 #include

 #include "arrow/acero/options.h"
 #ifndef NDEBUG
-#include "arrow/acero/options_internal.h"
+#  include "arrow/acero/options_internal.h"
 #endif
 #include "arrow/acero/map_node.h"
 #include "arrow/acero/query_context.h"
diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h
index 50d07bfd948e0..530beaea64827 100644
--- a/cpp/src/arrow/acero/bloom_filter.h
+++ b/cpp/src/arrow/acero/bloom_filter.h
@@ -18,7 +18,7 @@
 #pragma once

 #if defined(ARROW_HAVE_RUNTIME_AVX2)
-#include
+#  include
 #endif

 #include
diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc
index a2d6e9575a1aa..30cafd120caea 100644
--- a/cpp/src/arrow/acero/bloom_filter_test.cc
+++ b/cpp/src/arrow/acero/bloom_filter_test.cc
@@ -503,9 +503,9 @@ TEST(BloomFilter, Scaling) {
   num_build.push_back(4000000);

   std::vector<BloomFilterBuildStrategy> strategies;
-#ifdef ARROW_ENABLE_THREADING
+#  ifdef ARROW_ENABLE_THREADING
   strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
-#endif
+#  endif
   strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);

   for (const auto hardware_flags : HardwareFlagsForTesting()) {
diff --git a/cpp/src/arrow/acero/groupby_aggregate_node.cc b/cpp/src/arrow/acero/groupby_aggregate_node.cc
index 723c8b7377e13..06b034ab2d459 100644
--- a/cpp/src/arrow/acero/groupby_aggregate_node.cc
+++ b/cpp/src/arrow/acero/groupby_aggregate_node.cc
@@ -369,13 +369,14 @@ Status GroupByNode::InputReceived(ExecNode* input, ExecBatch batch) {
   DCHECK_EQ(input, inputs_[0]);

   auto handler = [this](const ExecBatch& full_batch, const Segment& segment) {
-    if (!segment.extends && segment.offset == 0) RETURN_NOT_OK(OutputResult(false));
+    if (!segment.extends && segment.offset == 0)
+      RETURN_NOT_OK(OutputResult(/*is_last=*/false));
     auto exec_batch = full_batch.Slice(segment.offset, segment.length);
     auto batch = ExecSpan(exec_batch);
     RETURN_NOT_OK(Consume(batch));
     RETURN_NOT_OK(
         ExtractSegmenterValues(&segmenter_values_, exec_batch, segment_key_field_ids_));
-    if (!segment.is_open) RETURN_NOT_OK(OutputResult(false));
+    if (!segment.is_open) RETURN_NOT_OK(OutputResult(/*is_last=*/false));
     return Status::OK();
   };
   ARROW_RETURN_NOT_OK(
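The segment handler above emits a result as soon as a segment group closes, instead of buffering the whole input. Stripped of the Acero machinery, the pattern looks like this (a plain C++ sketch under the assumption of segment-key-ordered input; Row and the sum aggregate are illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

struct Row {
  int64_t segment_key;
  int64_t value;
};

// Emit one aggregated result per contiguous run of equal segment keys:
// when the current segment "closes" (the key changes), output the
// accumulated state and reset it, mirroring OutputResult(/*is_last=*/false).
void SegmentedSum(const std::vector<Row>& ordered_rows) {
  bool has_open_segment = false;
  int64_t current_key = 0;
  int64_t sum = 0;
  for (const Row& row : ordered_rows) {
    if (has_open_segment && row.segment_key != current_key) {
      std::cout << current_key << " -> " << sum << "\n";  // segment closed
      sum = 0;
    }
    current_key = row.segment_key;
    has_open_segment = true;
    sum += row.value;
  }
  if (has_open_segment) std::cout << current_key << " -> " << sum << "\n";
}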
diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc
index 470960b1c5062..e3e37e249e6a3 100644
--- a/cpp/src/arrow/acero/hash_join_benchmark.cc
+++ b/cpp/src/arrow/acero/hash_join_benchmark.cc
@@ -104,7 +104,7 @@ class JoinBenchmark {
       key_cmp.push_back(JoinKeyCmp::EQ);
     }

-    for (size_t i = 0; i < settings.build_payload_types.size(); i++) {
+    for (size_t i = 0; i < settings.probe_payload_types.size(); i++) {
       std::string name = "lp" + std::to_string(i);
       DCHECK_OK(l_schema_builder.AddField(field(name, settings.probe_payload_types[i])));
     }
@@ -279,7 +279,7 @@ static void BM_HashJoinBasic_MatchesPerRow(benchmark::State& st) {
   settings.cardinality = 1.0 / static_cast<double>(st.range(0));
   settings.num_build_batches = static_cast<int>(st.range(1));
-  settings.num_probe_batches = settings.num_probe_batches;
+  settings.num_probe_batches = settings.num_build_batches;

   HashJoinBasicBenchmarkImpl(st, settings);
 }
@@ -291,7 +291,7 @@ static void BM_HashJoinBasic_PayloadSize(benchmark::State& st) {
   settings.cardinality = 1.0 / static_cast<double>(st.range(1));
   settings.num_build_batches = static_cast<int>(st.range(2));
-  settings.num_probe_batches = settings.num_probe_batches;
+  settings.num_probe_batches = settings.num_build_batches;

   HashJoinBasicBenchmarkImpl(st, settings);
 }
diff --git a/cpp/src/arrow/acero/hash_join_dict.cc b/cpp/src/arrow/acero/hash_join_dict.cc
index 3aef08e6e9ccf..8db9dddb2c3a0 100644
--- a/cpp/src/arrow/acero/hash_join_dict.cc
+++ b/cpp/src/arrow/acero/hash_join_dict.cc
@@ -225,21 +225,20 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr<Array> dictiona
     return Status::OK();
   }

-  dictionary_ = dictionary;
+  dictionary_ = std::move(dictionary);

   // Initialize encoder
   RowEncoder encoder;
-  std::vector<TypeHolder> encoder_types;
-  encoder_types.emplace_back(value_type_);
+  std::vector<TypeHolder> encoder_types{value_type_};
   encoder.Init(encoder_types, ctx);

   // Encode all dictionary values
-  int64_t length = dictionary->data()->length;
+  int64_t length = dictionary_->data()->length;
   if (length >= std::numeric_limits<int32_t>::max()) {
     return Status::Invalid(
         "Dictionary length in hash join must fit into signed 32-bit integer.");
   }
-  RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary->data()}, length)));
+  RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary_->data()}, length)));

   std::vector<int32_t> entries_to_take;
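The hash_join_dict.cc hunk above is a use-after-move fix: once `dictionary` is moved into the member `dictionary_`, the local parameter must not be dereferenced again. A minimal standalone illustration of the hazard and the fix pattern (the Builder type and names are illustrative, not Arrow code):

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

class Builder {
 public:
  void Init(std::shared_ptr<std::vector<int>> values) {
    values_ = std::move(values);
    // BUG (the pre-fix pattern): `values->size()` here would dereference a
    // moved-from shared_ptr, which is null after the move above.
    // Fix: consistently go through the member once ownership has moved.
    size_ = values_->size();
  }

 private:
  std::shared_ptr<std::vector<int>> values_;
  size_t size_ = 0;
};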
diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc
index 67f902e64be93..80dd163ced740 100644
--- a/cpp/src/arrow/acero/hash_join_node.cc
+++ b/cpp/src/arrow/acero/hash_join_node.cc
@@ -61,30 +61,30 @@ Result<std::vector<FieldRef>> HashJoinSchema::ComputePayload(
     const std::vector<FieldRef>& filter, const std::vector<FieldRef>& keys) {
   // payload = (output + filter) - keys, with no duplicates
   std::unordered_set<int> payload_fields;
-  for (auto ref : output) {
+  for (const auto& ref : output) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.insert(match[0]);
   }

-  for (auto ref : filter) {
+  for (const auto& ref : filter) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.insert(match[0]);
   }

-  for (auto ref : keys) {
+  for (const auto& ref : keys) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.erase(match[0]);
   }

   std::vector<FieldRef> payload_refs;
-  for (auto ref : output) {
+  for (const auto& ref : output) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     if (payload_fields.find(match[0]) != payload_fields.end()) {
       payload_refs.push_back(ref);
       payload_fields.erase(match[0]);
     }
   }
-  for (auto ref : filter) {
+  for (const auto& ref : filter) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     if (payload_fields.find(match[0]) != payload_fields.end()) {
       payload_refs.push_back(ref);
@@ -198,7 +198,7 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
     return Status::Invalid("Different number of key fields on left (", left_keys.size(),
                            ") and right (", right_keys.size(), ") side of the join");
   }
-  if (left_keys.size() < 1) {
+  if (left_keys.empty()) {
     return Status::Invalid("Join key cannot be empty");
   }
   for (size_t i = 0; i < left_keys.size() + right_keys.size(); ++i) {
@@ -432,7 +432,7 @@ Status HashJoinSchema::CollectFilterColumns(std::vector<FieldRef>& left_filter,
           indices[0] -= left_schema.num_fields();
           FieldPath corrected_path(std::move(indices));
           if (right_seen_paths.find(*path) == right_seen_paths.end()) {
-            right_filter.push_back(corrected_path);
+            right_filter.emplace_back(corrected_path);
             right_seen_paths.emplace(std::move(corrected_path));
           }
         } else if (left_seen_paths.find(*path) == left_seen_paths.end()) {
@@ -698,7 +698,7 @@ class HashJoinNode : public ExecNode, public TracedNode {
                std::shared_ptr<Schema> output_schema,
                std::unique_ptr<HashJoinSchema> schema_mgr, Expression filter,
                std::unique_ptr<HashJoinImpl> impl)
-      : ExecNode(plan, inputs, {"left", "right"},
+      : ExecNode(plan, std::move(inputs), {"left", "right"},
                  /*output_schema=*/std::move(output_schema)),
         TracedNode(this),
         join_type_(join_options.join_type),
diff --git a/cpp/src/arrow/acero/hash_join_node.h b/cpp/src/arrow/acero/hash_join_node.h
index ad60019ceabc4..19745b8675cf0 100644
--- a/cpp/src/arrow/acero/hash_join_node.h
+++ b/cpp/src/arrow/acero/hash_join_node.h
@@ -65,9 +65,9 @@ class ARROW_ACERO_EXPORT HashJoinSchema {
   std::shared_ptr<Schema> MakeOutputSchema(const std::string& left_field_name_suffix,
                                            const std::string& right_field_name_suffix);

-  bool LeftPayloadIsEmpty() { return PayloadIsEmpty(0); }
+  bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); }

-  bool RightPayloadIsEmpty() { return PayloadIsEmpty(1); }
+  bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); }

   static int kMissingField() {
     return SchemaProjectionMaps<HashJoinProjection>::kMissingField;
   }
@@ -88,7 +88,7 @@ class ARROW_ACERO_EXPORT HashJoinSchema {
                               const SchemaProjectionMap& right_to_filter,
                               const Expression& filter);

-  bool PayloadIsEmpty(int side) {
+  bool PayloadIsEmpty(int side) const {
     assert(side == 0 || side == 1);
     return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0;
   }
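The comment in ComputePayload above, "payload = (output + filter) - keys, with no duplicates", is ordinary set arithmetic. A self-contained sketch with plain strings standing in for field references (not the Arrow implementation, which must also resolve FieldRefs against a schema):

#include <string>
#include <unordered_set>
#include <vector>

// payload = (output + filter) - keys, preserving first-seen order and
// dropping duplicates, as in HashJoinSchema::ComputePayload.
std::vector<std::string> ComputePayloadColumns(const std::vector<std::string>& output,
                                               const std::vector<std::string>& filter,
                                               const std::vector<std::string>& keys) {
  std::unordered_set<std::string> candidates(output.begin(), output.end());
  candidates.insert(filter.begin(), filter.end());
  for (const auto& key : keys) candidates.erase(key);

  std::vector<std::string> payload;
  for (const auto& name : output) {
    if (candidates.erase(name) > 0) payload.push_back(name);  // erased == first sighting
  }
  for (const auto& name : filter) {
    if (candidates.erase(name) > 0) payload.push_back(name);
  }
  return payload;
}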
diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc
index 9065e286a2228..76ad9c7d650eb 100644
--- a/cpp/src/arrow/acero/hash_join_node_test.cc
+++ b/cpp/src/arrow/acero/hash_join_node_test.cc
@@ -29,6 +29,7 @@
 #include "arrow/compute/kernels/test_util.h"
 #include "arrow/compute/light_array_internal.h"
 #include "arrow/compute/row/row_encoder_internal.h"
+#include "arrow/extension/uuid.h"
 #include "arrow/testing/extension_type.h"
 #include "arrow/testing/generator.h"
 #include "arrow/testing/gtest_util.h"
diff --git a/cpp/src/arrow/acero/options_internal.h b/cpp/src/arrow/acero/options_internal.h
index d4bf79a7cd008..fd3ea78116572 100644
--- a/cpp/src/arrow/acero/options_internal.h
+++ b/cpp/src/arrow/acero/options_internal.h
@@ -18,8 +18,8 @@
 #pragma once

 #ifndef NDEBUG
-#include
-#include
+#  include
+#  include
 #endif

 namespace arrow {
diff --git a/cpp/src/arrow/acero/scalar_aggregate_node.cc b/cpp/src/arrow/acero/scalar_aggregate_node.cc
index c7805f4d24eb2..b34f7511cc12b 100644
--- a/cpp/src/arrow/acero/scalar_aggregate_node.cc
+++ b/cpp/src/arrow/acero/scalar_aggregate_node.cc
@@ -234,7 +234,8 @@ Status ScalarAggregateNode::InputReceived(ExecNode* input, ExecBatch batch) {
     // (1) The segment is starting of a new segment group and points to
     // the beginning of the batch, then it means no data in the batch belongs
     // to the current segment group. We can output and reset kernel states.
-    if (!segment.extends && segment.offset == 0) RETURN_NOT_OK(OutputResult(false));
+    if (!segment.extends && segment.offset == 0)
+      RETURN_NOT_OK(OutputResult(/*is_last=*/false));

     // We add segment to the current segment group aggregation
     auto exec_batch = full_batch.Slice(segment.offset, segment.length);
@@ -244,7 +245,7 @@ Status ScalarAggregateNode::InputReceived(ExecNode* input, ExecBatch batch) {

     // If the segment closes the current segment group, we can output segment group
     // aggregation.
-    if (!segment.is_open) RETURN_NOT_OK(OutputResult(false));
+    if (!segment.is_open) RETURN_NOT_OK(OutputResult(/*is_last=*/false));

     return Status::OK();
   };
diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc
index 4d0c8187ac6e2..6c783110af571 100644
--- a/cpp/src/arrow/acero/swiss_join.cc
+++ b/cpp/src/arrow/acero/swiss_join.cc
@@ -1667,7 +1667,7 @@ Result<std::shared_ptr<ArrayData>> JoinResultMaterialize::FlushBuildColumn(
     const std::shared_ptr<DataType>& data_type, const RowArray* row_array, int column_id,
     uint32_t* row_ids) {
   ResizableArrayData output;
-  output.Init(data_type, pool_, bit_util::Log2(num_rows_));
+  RETURN_NOT_OK(output.Init(data_type, pool_, bit_util::Log2(num_rows_)));

   for (size_t i = 0; i <= null_ranges_.size(); ++i) {
     int row_id_begin =
@@ -2247,8 +2247,9 @@ Result<ExecBatch> JoinResidualFilter::MaterializeFilterInput(
       build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD);
   for (int i = 0; i < num_build_cols; ++i) {
     ResizableArrayData column_data;
-    column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), pool_,
-                     bit_util::Log2(num_batch_rows));
+    RETURN_NOT_OK(
+        column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i),
+                         pool_, bit_util::Log2(num_batch_rows)));
     if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) {
       RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows,
                                                 key_ids_maybe_null, pool_));
diff --git a/cpp/src/arrow/acero/visibility.h b/cpp/src/arrow/acero/visibility.h
index 02382232b69dd..21a697a56eca9 100644
--- a/cpp/src/arrow/acero/visibility.h
+++ b/cpp/src/arrow/acero/visibility.h
@@ -20,31 +20,31 @@
 #pragma once

 #if defined(_WIN32) || defined(__CYGWIN__)
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable : 4251)
-#else
-#pragma GCC diagnostic ignored "-Wattributes"
-#endif
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif

-#ifdef ARROW_ACERO_STATIC
-#define ARROW_ACERO_EXPORT
-#elif defined(ARROW_ACERO_EXPORTING)
-#define ARROW_ACERO_EXPORT __declspec(dllexport)
-#else
-#define ARROW_ACERO_EXPORT __declspec(dllimport)
-#endif
+#  ifdef ARROW_ACERO_STATIC
+#    define ARROW_ACERO_EXPORT
+#  elif defined(ARROW_ACERO_EXPORTING)
+#    define ARROW_ACERO_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_ACERO_EXPORT __declspec(dllimport)
+#  endif

-#define ARROW_ACERO_NO_EXPORT
+#  define ARROW_ACERO_NO_EXPORT
 #else  // Not Windows
-#ifndef ARROW_ACERO_EXPORT
-#define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
-#endif
-#ifndef ARROW_ACERO_NO_EXPORT
-#define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
-#endif
+#  ifndef ARROW_ACERO_EXPORT
+#    define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_ACERO_NO_EXPORT
+#    define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
 #endif  // Not-Windows

 #if defined(_MSC_VER)
-#pragma warning(pop)
+#  pragma warning(pop)
 #endif
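The swiss_join.cc hunks above wrap `ResizableArrayData::Init` in RETURN_NOT_OK because Init reports allocation failure through a Status that was previously dropped on the floor. The general shape of that fix, as a hedged sketch using Arrow's public status macros (InitBuffers and Materialize are illustrative names):

#include <cstdint>
#include "arrow/status.h"

// A fallible initializer in the style of ResizableArrayData::Init: the
// returned Status carries allocation failures and must be checked.
arrow::Status InitBuffers(int64_t log2_rows) {
  if (log2_rows < 0) return arrow::Status::Invalid("negative size hint");
  return arrow::Status::OK();  // real allocation elided in this sketch
}

arrow::Status Materialize() {
  // Propagate the error instead of silently ignoring it.
  ARROW_RETURN_NOT_OK(InitBuffers(10));
  return arrow::Status::OK();
}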
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 25759f8471365..d16b6cfd2e97d 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -25,7 +25,7 @@
 #include

 #ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK
-#include
+#  include
 #endif

 #include "arrow/adapters/orc/util.h"
diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h
index 716ae0722069e..e4af67d7e5f0b 100644
--- a/cpp/src/arrow/array/array_base.h
+++ b/cpp/src/arrow/array/array_base.h
@@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
   /// \return DeviceAllocationType
   DeviceAllocationType device_type() const { return data_->device_type(); }

+  /// \brief Return the statistics of this Array
+  ///
+  /// This just delegates to calling statistics on the underlying ArrayData
+  /// object which backs this Array.
+  ///
+  /// \return std::shared_ptr<ArrayStatistics>
+  std::shared_ptr<ArrayStatistics> statistics() const { return data_->statistics; }
+
  protected:
   Array() = default;
   ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
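Reading the new accessor from consumer code looks like the sketch below. Hedged assumptions: the statistics pointer is null unless a producer (e.g. a file reader) attached ArrayStatistics to the backing ArrayData, and the field shapes (optional null_count, variant-typed min/max, is_min_exact) follow the test usage later in this diff.

#include <cstdint>
#include <iostream>
#include <variant>
#include "arrow/api.h"
#include "arrow/array/statistics.h"

void PrintStatistics(const arrow::Array& array) {
  std::shared_ptr<arrow::ArrayStatistics> stats = array.statistics();
  if (!stats) {
    std::cout << "no statistics attached\n";  // the common case today
    return;
  }
  if (stats->null_count.has_value()) {
    std::cout << "null count: " << *stats->null_count << "\n";
  }
  // min/max are optional variants; int64_t is one of the alternatives.
  if (stats->min.has_value() && std::holds_alternative<int64_t>(*stats->min)) {
    std::cout << "min: " << std::get<int64_t>(*stats->min)
              << (stats->is_min_exact ? " (exact)" : " (approximate)") << "\n";
  }
}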
diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc
index 7fd76a1dae81b..55e086af30bc2 100644
--- a/cpp/src/arrow/array/array_dict.cc
+++ b/cpp/src/arrow/array/array_dict.cc
@@ -349,7 +349,7 @@ class DictionaryUnifierImpl : public DictionaryUnifier {
   using MemoTableType = typename DictTraits<T>::MemoTableType;

   DictionaryUnifierImpl(MemoryPool* pool, std::shared_ptr<DataType> value_type)
-      : pool_(pool), value_type_(value_type), memo_table_(pool) {}
+      : pool_(pool), value_type_(std::move(value_type)), memo_table_(pool) {}

   Status Unify(const Array& dictionary, std::shared_ptr<Buffer>* out) override {
     if (dictionary.null_count() > 0) {
@@ -432,7 +432,7 @@ struct MakeUnifier {
   std::unique_ptr<DictionaryUnifier> result;

   MakeUnifier(MemoryPool* pool, std::shared_ptr<DataType> value_type)
-      : pool(pool), value_type(value_type) {}
+      : pool(pool), value_type(std::move(value_type)) {}

   template <typename T>
   enable_if_no_memoize<T, Status> Visit(const T&) {
diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc
index 47c0fd35829a1..bb469df1ad6b4 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -542,7 +542,7 @@ Result<std::shared_ptr<ListArray>> ListArray::FromArrays(
     const Array& offsets, const Array& values, MemoryPool* pool,
     std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   return ListArrayFromArrays(std::make_shared<ListType>(values.type()), offsets,
-                             values, pool, null_bitmap, null_count);
+                             values, pool, std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<ListArray>> ListArray::FromListView(const ListViewArray& source,
@@ -563,7 +563,7 @@ Result<std::shared_ptr<ListArray>> ListArray::FromArrays(
     return Status::TypeError("Mismatching list value type");
   }
   return ListArrayFromArrays(std::move(type), offsets, values, pool,
-                             null_bitmap, null_count);
+                             std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<Array>> ListArray::Flatten(MemoryPool* memory_pool) const {
@@ -599,8 +599,8 @@ Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(
     const Array& offsets, const Array& values, MemoryPool* pool,
     std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   return ListArrayFromArrays(
-      std::make_shared<LargeListType>(values.type()), offsets, values, pool, null_bitmap,
-      null_count);
+      std::make_shared<LargeListType>(values.type()), offsets, values, pool,
+      std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<LargeListArray>> LargeListArray::FromListView(
@@ -622,7 +622,7 @@ Result<std::shared_ptr<LargeListArray>> LargeListArray::FromArrays(
     return Status::TypeError("Mismatching list value type");
   }
   return ListArrayFromArrays(std::move(type), offsets, values, pool,
-                             null_bitmap, null_count);
+                             std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<Array>> LargeListArray::Flatten(MemoryPool* memory_pool) const {
@@ -662,7 +662,7 @@ Result<std::shared_ptr<ListViewArray>> ListViewArray::FromArrays(
     std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   return ListViewArrayFromArrays(
       std::make_shared<ListViewType>(values.type()), offsets, sizes, values, pool,
-      null_bitmap, null_count);
+      std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<ListViewArray>> ListViewArray::FromArrays(
@@ -677,7 +677,7 @@ Result<std::shared_ptr<ListViewArray>> ListViewArray::FromArrays(
     return Status::TypeError("Mismatching list-view value type");
   }
   return ListViewArrayFromArrays(std::move(type), offsets, sizes, values,
-                                 pool, null_bitmap, null_count);
+                                 pool, std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<ListViewArray>> ListViewArray::FromList(const ListArray& source,
@@ -722,7 +722,7 @@ LargeListViewArray::LargeListViewArray(std::shared_ptr<DataType> type, int64_t l
                                        std::shared_ptr<Buffer> null_bitmap,
                                        int64_t null_count, int64_t offset) {
   LargeListViewArray::SetData(ArrayData::Make(
-      type, length,
+      std::move(type), length,
      {std::move(null_bitmap), std::move(value_offsets), std::move(value_sizes)},
      /*child_data=*/{values->data()}, null_count, offset));
 }
@@ -737,7 +737,7 @@ Result<std::shared_ptr<LargeListViewArray>> LargeListViewArray::FromArrays(
     std::shared_ptr<Buffer> null_bitmap, int64_t null_count) {
   return ListViewArrayFromArrays(
       std::make_shared<LargeListViewType>(values.type()), offsets, sizes, values, pool,
-      null_bitmap, null_count);
+      std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<LargeListViewArray>> LargeListViewArray::FromArrays(
@@ -752,7 +752,7 @@ Result<std::shared_ptr<LargeListViewArray>> LargeListViewArray::FromArrays(
     return Status::TypeError("Mismatching large list-view value type");
   }
   return ListViewArrayFromArrays(
-      std::move(type), offsets, sizes, values, pool, null_bitmap, null_count);
+      std::move(type), offsets, sizes, values, pool, std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<Array>> LargeListViewArray::Flatten(
@@ -854,8 +854,9 @@ Result<std::shared_ptr<Array>> MapArray::FromArraysInternal(
     null_count = kUnknownNullCount;
   }
   buffers[1] = typed_offsets.values();
-  return std::make_shared<MapArray>(type, offsets->length() - 1, std::move(buffers), keys,
-                                    items, /*null_count=*/null_count, offsets->offset());
+  return std::make_shared<MapArray>(std::move(type), offsets->length() - 1,
+                                    std::move(buffers), keys, items,
+                                    /*null_count=*/null_count, offsets->offset());
 }

 Result<std::shared_ptr<Array>> MapArray::FromArrays(const std::shared_ptr<Array>& offsets,
@@ -971,8 +972,8 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
   int64_t length = values->length() / list_size;
   auto list_type = std::make_shared<FixedSizeListType>(values->type(), list_size);
-  return std::make_shared<FixedSizeListArray>(list_type, length, values, null_bitmap,
-                                              null_count);
+  return std::make_shared<FixedSizeListArray>(list_type, length, values,
+                                              std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
@@ -992,8 +993,8 @@ Result<std::shared_ptr<Array>> FixedSizeListArray::FromArrays(
   }
   int64_t length = values->length() / list_type.list_size();
-  return std::make_shared<FixedSizeListArray>(type, length, values, null_bitmap,
-                                              null_count);
+  return std::make_shared<FixedSizeListArray>(std::move(type), length, values,
+                                              std::move(null_bitmap), null_count);
 }

 Result<std::shared_ptr<Array>> FixedSizeListArray::Flatten(
@@ -1015,7 +1016,7 @@ StructArray::StructArray(const std::shared_ptr<DataType>& type, int64_t length,
                          std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
                          int64_t offset) {
   ARROW_CHECK_EQ(type->id(), Type::STRUCT);
-  SetData(ArrayData::Make(type, length, {null_bitmap}, null_count, offset));
+  SetData(ArrayData::Make(type, length, {std::move(null_bitmap)}, null_count, offset));
   for (const auto& child : children) {
     data_->child_data.push_back(child->data());
   }
@@ -1048,7 +1049,7 @@ Result<std::shared_ptr<StructArray>> StructArray::Make(
     null_count = 0;
   }
   return std::make_shared<StructArray>(struct_(fields), length - offset, children,
-                                       null_bitmap, null_count, offset);
+                                       std::move(null_bitmap), null_count, offset);
 }

 Result<std::shared_ptr<StructArray>> StructArray::Make(
@@ -1085,8 +1086,8 @@ const std::shared_ptr<Array>& StructArray::field(int i) const {
     } else {
       field_data = data_->child_data[i];
     }
-    std::shared_ptr<Array> result = MakeArray(field_data);
-    std::atomic_store(&boxed_fields_[i], result);
+    result = MakeArray(field_data);
+    std::atomic_store(&boxed_fields_[i], std::move(result));
     return boxed_fields_[i];
   }
   return boxed_fields_[i];
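For readers who have not used the FromArrays factories the hunks above touch, here is a short usage example against the public API (the offsets/values are illustrative data):

#include <iostream>
#include "arrow/api.h"

// Build a ListArray from raw offsets and flattened values. FromArrays
// validates the offsets and derives the list type from the value type.
arrow::Status BuildList() {
  arrow::Int32Builder offsets_builder;
  ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 2, 2, 5}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> offsets,
                        offsets_builder.Finish());

  arrow::Int64Builder values_builder;
  ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4, 5}));
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Array> values,
                        values_builder.Finish());

  // Three lists: [1, 2], [], [3, 4, 5]
  ARROW_ASSIGN_OR_RAISE(auto list_array,
                        arrow::ListArray::FromArrays(*offsets, *values));
  std::cout << list_array->ToString() << std::endl;
  return arrow::Status::OK();
}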
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 32806d9d2edb3..73e0c692432b6 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
   }
 }

+class TestArrayDataStatistics : public ::testing::Test {
+ public:
+  void SetUp() {
+    valids_ = {1, 0, 1, 1};
+    null_count_ = std::count(valids_.begin(), valids_.end(), 0);
+    null_buffer_ = *internal::BytesToBits(valids_);
+    values_ = {1, 0, 3, -4};
+    min_ = *std::min_element(values_.begin(), values_.end());
+    max_ = *std::max_element(values_.begin(), values_.end());
+    values_buffer_ = Buffer::FromVector(values_);
+    data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
+                            null_count_);
+    data_->statistics = std::make_shared<ArrayStatistics>();
+    data_->statistics->null_count = null_count_;
+    data_->statistics->min = min_;
+    data_->statistics->is_min_exact = true;
+    data_->statistics->max = max_;
+    data_->statistics->is_max_exact = true;
+  }
+
+ protected:
+  std::vector<uint8_t> valids_;
+  size_t null_count_;
+  std::shared_ptr<Buffer> null_buffer_;
+  std::vector<int32_t> values_;
+  int64_t min_;
+  int64_t max_;
+  std::shared_ptr<Buffer> values_buffer_;
+  std::shared_ptr<ArrayData> data_;
+};
+
+TEST_F(TestArrayDataStatistics, MoveConstructor) {
+  ArrayData copied_data(*data_);
+  ArrayData moved_data(std::move(copied_data));
+
+  ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+  ASSERT_TRUE(moved_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(moved_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyConstructor) {
+  ArrayData copied_data(*data_);
+
+  ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, MoveAssignment) {
+  ArrayData copied_data(*data_);
+  ArrayData moved_data;
+  moved_data = std::move(copied_data);
+
+  ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+  ASSERT_TRUE(moved_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(moved_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyAssignment) {
+  ArrayData copied_data;
+  copied_data = *data_;
+
+  ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyTo) {
+  ASSERT_OK_AND_ASSIGN(auto copied_data,
+                       data_->CopyTo(arrow::default_cpu_memory_manager()));
+
+  ASSERT_TRUE(copied_data->statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data->statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
+  ASSERT_TRUE(copied_data->statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data->statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
+  ASSERT_TRUE(copied_data->statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, Slice) {
+  auto sliced_data = data_->Slice(0, 1);
+  ASSERT_FALSE(sliced_data->statistics);
+}
+
 template <typename T>
 class TestPrimitiveArray : public ::testing::Test {
  public:
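Taken together, the tests above pin down the propagation rules: copies (and CopyTo) carry statistics along, while Slice drops them. A compact sketch of what a consumer can rely on (assumes the caller attached statistics beforehand, as the test fixture does):

#include <cassert>
#include <memory>
#include "arrow/api.h"

// Statistics survive copies but are dropped by Slice(), since min/max/null
// counts computed for the whole array may not hold for a sub-range.
void SliceDropsStatistics(const std::shared_ptr<arrow::ArrayData>& data) {
  assert(data->statistics != nullptr);  // assumption: stats were attached

  std::shared_ptr<arrow::ArrayData> sliced = data->Slice(0, 1);
  assert(sliced->statistics == nullptr);  // always discarded on slice

  arrow::ArrayData copied(*data);  // the copy shares the statistics object
  assert(copied.statistics == data->statistics);
}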
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 83eeb56c496cf..8e29297a8c175 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const ArrayData& data,
     ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn));
   }

+  output->statistics = data.statistics;
+
   return output;
 }
 }  // namespace
@@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
   } else {
     copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
   }
+  copy->statistics = nullptr;
   return copy;
 }
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index e0508fe6980a7..1e6ee9a1d32ff 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -24,6 +24,7 @@
 #include
 #include

+#include "arrow/array/statistics.h"
 #include "arrow/buffer.h"
 #include "arrow/result.h"
 #include "arrow/type.h"
@@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
         offset(other.offset),
         buffers(std::move(other.buffers)),
         child_data(std::move(other.child_data)),
-        dictionary(std::move(other.dictionary)) {
+        dictionary(std::move(other.dictionary)),
+        statistics(std::move(other.statistics)) {
     SetNullCount(other.null_count);
   }

@@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
         offset(other.offset),
         buffers(other.buffers),
         child_data(other.child_data),
-        dictionary(other.dictionary) {
+        dictionary(other.dictionary),
+        statistics(other.statistics) {
     SetNullCount(other.null_count);
   }

@@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
     buffers = std::move(other.buffers);
     child_data = std::move(other.child_data);
     dictionary = std::move(other.dictionary);
+    statistics = std::move(other.statistics);
     return *this;
   }

@@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
     buffers = other.buffers;
     child_data = other.child_data;
     dictionary = other.dictionary;
+    statistics = other.statistics;
     return *this;
   }

@@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
   }

   /// \brief Construct a zero-copy slice of the data with the given offset and length
+  ///
+  /// The associated `ArrayStatistics` is always discarded in a sliced
+  /// `ArrayData`, because statistics computed for the original `ArrayData`
+  /// may not be valid for the sliced range. If you want to reuse statistics
+  /// from the original `ArrayData`, you need to do so explicitly.
+  ///
+  /// Even if the specified slice range covers the entire original
+  /// `ArrayData` (in which case the statistics would still apply), the
+  /// associated `ArrayStatistics` is discarded as well. Use `Copy()`
+  /// instead for that case.
   std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

   /// \brief Input-checking variant of Slice
@@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {

   // The dictionary for this Array, if any. Only used for dictionary type
   std::shared_ptr<ArrayData> dictionary;
+
+  // The statistics for this Array.
+  std::shared_ptr<ArrayStatistics> statistics;
 };

 /// \brief A non-owning Buffer reference
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 0d940d3bc869e..69f1646054f4c 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -985,10 +985,22 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d

 ARROW_EXPORT
 Status ValidateUTF8(const ArrayData& data) {
-  DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW ||
-         data.type->id() == Type::LARGE_STRING);
-  UTF8DataValidator validator{data};
-  return VisitTypeInline(*data.type, &validator);
+  const auto& storage_type =
+      (data.type->id() == Type::EXTENSION)
+          ? checked_cast<const ExtensionType&>(*data.type).storage_type()
+          : data.type;
+  DCHECK(storage_type->id() == Type::STRING || storage_type->id() == Type::STRING_VIEW ||
+         storage_type->id() == Type::LARGE_STRING);
+
+  if (data.type->id() == Type::EXTENSION) {
+    ArrayData ext_data(data);
+    ext_data.type = storage_type;
+    UTF8DataValidator validator{ext_data};
+    return VisitTypeInline(*storage_type, &validator);
+  } else {
+    UTF8DataValidator validator{data};
+    return VisitTypeInline(*storage_type, &validator);
+  }
 }

 ARROW_EXPORT
diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 6abe866b5f6f6..db051fff5ff05 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -41,11 +41,11 @@ extern "C" {
 #endif

 #ifndef ARROW_C_DATA_INTERFACE
-#define ARROW_C_DATA_INTERFACE
+#  define ARROW_C_DATA_INTERFACE

-#define ARROW_FLAG_DICTIONARY_ORDERED 1
-#define ARROW_FLAG_NULLABLE 2
-#define ARROW_FLAG_MAP_KEYS_SORTED 4
+#  define ARROW_FLAG_DICTIONARY_ORDERED 1
+#  define ARROW_FLAG_NULLABLE 2
+#  define ARROW_FLAG_MAP_KEYS_SORTED 4

 struct ArrowSchema {
   // Array type description
@@ -83,7 +83,7 @@ struct ArrowArray {
 #endif  // ARROW_C_DATA_INTERFACE

 #ifndef ARROW_C_DEVICE_DATA_INTERFACE
-#define ARROW_C_DEVICE_DATA_INTERFACE
+#  define ARROW_C_DEVICE_DATA_INTERFACE

 // Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html

@@ -91,33 +91,33 @@ struct ArrowArray {
 typedef int32_t ArrowDeviceType;

 // CPU device, same as using ArrowArray directly
-#define ARROW_DEVICE_CPU 1
+#  define ARROW_DEVICE_CPU 1
 // CUDA GPU Device
-#define ARROW_DEVICE_CUDA 2
+#  define ARROW_DEVICE_CUDA 2
 // Pinned CUDA CPU memory by cudaMallocHost
-#define ARROW_DEVICE_CUDA_HOST 3
+#  define ARROW_DEVICE_CUDA_HOST 3
 // OpenCL Device
-#define ARROW_DEVICE_OPENCL 4
+#  define ARROW_DEVICE_OPENCL 4
 // Vulkan buffer for next-gen graphics
-#define ARROW_DEVICE_VULKAN 7
+#  define ARROW_DEVICE_VULKAN 7
 // Metal for Apple GPU
-#define ARROW_DEVICE_METAL 8
+#  define ARROW_DEVICE_METAL 8
 // Verilog simulator buffer
-#define ARROW_DEVICE_VPI 9
+#  define ARROW_DEVICE_VPI 9
 // ROCm GPUs for AMD GPUs
-#define ARROW_DEVICE_ROCM 10
+#  define ARROW_DEVICE_ROCM 10
 // Pinned ROCm CPU memory allocated by hipMallocHost
-#define ARROW_DEVICE_ROCM_HOST 11
+#  define ARROW_DEVICE_ROCM_HOST 11
 // Reserved for extension
-#define ARROW_DEVICE_EXT_DEV 12
+#  define ARROW_DEVICE_EXT_DEV 12
 // CUDA managed/unified memory allocated by cudaMallocManaged
-#define ARROW_DEVICE_CUDA_MANAGED 13
+#  define ARROW_DEVICE_CUDA_MANAGED 13
 // unified shared memory allocated on a oneAPI non-partitioned device.
-#define ARROW_DEVICE_ONEAPI 14
+#  define ARROW_DEVICE_ONEAPI 14
 // GPU support for next-gen WebGPU standard
-#define ARROW_DEVICE_WEBGPU 15
+#  define ARROW_DEVICE_WEBGPU 15
 // Qualcomm Hexagon DSP
-#define ARROW_DEVICE_HEXAGON 16
+#  define ARROW_DEVICE_HEXAGON 16

 struct ArrowDeviceArray {
   // the Allocated Array
@@ -138,7 +138,7 @@ struct ArrowDeviceArray {
 #endif  // ARROW_C_DEVICE_DATA_INTERFACE

 #ifndef ARROW_C_STREAM_INTERFACE
-#define ARROW_C_STREAM_INTERFACE
+#  define ARROW_C_STREAM_INTERFACE

 struct ArrowArrayStream {
   // Callback to get the stream type
@@ -179,7 +179,7 @@ struct ArrowArrayStream {
 #endif  // ARROW_C_STREAM_INTERFACE

 #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
-#define ARROW_C_DEVICE_STREAM_INTERFACE
+#  define ARROW_C_DEVICE_STREAM_INTERFACE

 // Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
 //
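The ARROW_DEVICE_* constants above exist so a consumer can dispatch on where an imported ArrowDeviceArray's buffers actually live. A minimal sketch of that dispatch (the classification of which device types count as CPU-accessible is an assumption based on the comments above, not normative):

#include <cstdio>
#include "arrow/c/abi.h"

// A CPU consumer should reject (or copy) device-resident data instead of
// dereferencing device pointers.
bool CanReadDirectly(const ArrowDeviceArray* device_array) {
  switch (device_array->device_type) {
    case ARROW_DEVICE_CPU:
    case ARROW_DEVICE_CUDA_HOST:  // pinned host memory is CPU-accessible
    case ARROW_DEVICE_ROCM_HOST:
      return true;
    default:
      std::fprintf(stderr, "device type %d requires a device-aware reader\n",
                   static_cast<int>(device_array->device_type));
      return false;
  }
}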
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 09bb524adbdf0..01fd56f631d99 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -48,7 +48,7 @@

 // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed
 #ifdef ARROW_COMPUTE
-#include "arrow/compute/api_vector.h"
+#  include "arrow/compute/api_vector.h"
 #endif

 namespace arrow {
diff --git a/cpp/src/arrow/c/dlpack_abi.h b/cpp/src/arrow/c/dlpack_abi.h
index 4af557a7ed5d7..fbe2a56a344b3 100644
--- a/cpp/src/arrow/c/dlpack_abi.h
+++ b/cpp/src/arrow/c/dlpack_abi.h
@@ -12,9 +12,9 @@
  * \brief Compatibility with C++
  */
 #ifdef __cplusplus
-#define DLPACK_EXTERN_C extern "C"
+#  define DLPACK_EXTERN_C extern "C"
 #else
-#define DLPACK_EXTERN_C
+#  define DLPACK_EXTERN_C
 #endif

 /*! \brief The current major version of dlpack */
@@ -25,13 +25,13 @@

 /*! \brief DLPACK_DLL prefix for windows */
 #ifdef _WIN32
-#ifdef DLPACK_EXPORTS
-#define DLPACK_DLL __declspec(dllexport)
+#  ifdef DLPACK_EXPORTS
+#    define DLPACK_DLL __declspec(dllexport)
+#  else
+#    define DLPACK_DLL __declspec(dllimport)
+#  endif
 #else
-#define DLPACK_DLL __declspec(dllimport)
-#endif
-#else
-#define DLPACK_DLL
+#  define DLPACK_DLL
 #endif

 #include
diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc
index 55eec53ced1c7..854127480744e 100644
--- a/cpp/src/arrow/chunk_resolver.cc
+++ b/cpp/src/arrow/chunk_resolver.cc
@@ -60,42 +60,38 @@ inline std::vector<int64_t> MakeChunksOffsets(const std::vector<T>& chunks) {

 template <typename IndexType>
 void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets,
                        int64_t n_indices, const IndexType* logical_index_vec,
-                       IndexType* out_chunk_index_vec, IndexType chunk_hint,
-                       IndexType* out_index_in_chunk_vec) {
+                       TypedChunkLocation<IndexType>* out_chunk_location_vec,
+                       IndexType chunk_hint) {
   auto* offsets = reinterpret_cast<const uint64_t*>(signed_offsets);
   const auto num_chunks = static_cast<IndexType>(num_offsets - 1);
   // chunk_hint in [0, num_offsets) per the precondition.
   for (int64_t i = 0; i < n_indices; i++) {
-    const auto index = static_cast<uint64_t>(logical_index_vec[i]);
+    auto typed_logical_index = logical_index_vec[i];
+    const auto index = static_cast<uint64_t>(typed_logical_index);
+    // use or update chunk_hint
     if (index >= offsets[chunk_hint] &&
        (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) {
-      out_chunk_index_vec[i] = chunk_hint;  // hint is correct!
-      continue;
+      // hint is correct!
+    } else {
+      // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
+      auto chunk_index =
+          ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
+      chunk_hint = static_cast<IndexType>(chunk_index);
     }
-    // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
-    auto chunk_index =
-        ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
-    chunk_hint = static_cast<IndexType>(chunk_index);
-    out_chunk_index_vec[i] = chunk_hint;
-  }
-  if (out_index_in_chunk_vec != NULLPTR) {
-    for (int64_t i = 0; i < n_indices; i++) {
-      auto logical_index = logical_index_vec[i];
-      auto chunk_index = out_chunk_index_vec[i];
-      // chunk_index is in [0, chunks.size()] no matter what the
-      // value of logical_index is, so it's always safe to dereference
-      // offset_ as it contains chunks.size()+1 values.
-      out_index_in_chunk_vec[i] =
-          logical_index - static_cast<IndexType>(offsets[chunk_index]);
+    out_chunk_location_vec[i].chunk_index = chunk_hint;
+    // chunk_index is in [0, chunks.size()] no matter what the
+    // value of logical_index is, so it's always safe to dereference
+    // offset_ as it contains chunks.size()+1 values.
+    out_chunk_location_vec[i].index_in_chunk =
+        typed_logical_index - static_cast<IndexType>(offsets[chunk_hint]);
 #if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)
-      // Make it more likely that Valgrind/ASAN can catch an invalid memory
-      // access by poisoning out_index_in_chunk_vec[i] when the logical
-      // index is out-of-bounds.
-      if (chunk_index == num_chunks) {
-        out_index_in_chunk_vec[i] = std::numeric_limits<IndexType>::max();
-      }
-#endif
+    // Make it more likely that Valgrind/ASAN can catch an invalid memory
+    // access by poisoning the index-in-chunk value when the logical
+    // index is out-of-bounds.
+    if (chunk_hint == num_chunks) {
+      out_chunk_location_vec[i].index_in_chunk = std::numeric_limits<IndexType>::max();
     }
+#endif
   }
 }

@@ -130,31 +126,31 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept {
 }

 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec,
-                                    uint8_t* out_chunk_index_vec, uint8_t chunk_hint,
-                                    uint8_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint8_t>* out_chunk_location_vec,
+                                    uint8_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }

 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec,
-                                    uint32_t* out_chunk_index_vec, uint32_t chunk_hint,
-                                    uint32_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint32_t>* out_chunk_location_vec,
+                                    uint32_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }

 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec,
-                                    uint16_t* out_chunk_index_vec, uint16_t chunk_hint,
-                                    uint16_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint16_t>* out_chunk_location_vec,
+                                    uint16_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }

 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec,
-                                    uint64_t* out_chunk_index_vec, uint64_t chunk_hint,
-                                    uint64_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint64_t>* out_chunk_location_vec,
+                                    uint64_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }

 }  // namespace arrow::internal
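With the refactor above, callers receive (chunk_index, index_in_chunk) pairs in one output array instead of two parallel ones. A sketch of the new call shape, mirroring the unit-test usage rather than a public contract (this is an internal API; the offsets-based constructor is assumed from the tests):

#include <cstdint>
#include <iostream>
#include <vector>
#include "arrow/chunk_resolver.h"

void ResolveExample() {
  // Three chunks with lengths 3, 2, 4 (logical length 9).
  arrow::internal::ChunkResolver resolver(std::vector<int64_t>{0, 3, 5, 9});

  std::vector<uint32_t> logical{0, 4, 8, 100};  // 100 is out of bounds
  std::vector<arrow::internal::TypedChunkLocation<uint32_t>> locations(logical.size());
  bool ok = resolver.ResolveMany<uint32_t>(
      static_cast<int64_t>(logical.size()), logical.data(), locations.data(),
      /*chunk_hint=*/0);
  if (!ok) return;  // chunk count does not fit in uint32_t

  for (const auto& loc : locations) {
    // chunk_index == chunks.size() (here 3) marks out-of-bounds indices;
    // index_in_chunk is undefined for those entries.
    std::cout << loc.chunk_index << ":" << loc.index_in_chunk << "\n";
  }
}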
diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
index a2a3d5a864243..83fda62387fe1 100644
--- a/cpp/src/arrow/chunk_resolver.h
+++ b/cpp/src/arrow/chunk_resolver.h
@@ -31,28 +31,34 @@ namespace arrow::internal {

 struct ChunkResolver;

-struct ChunkLocation {
+template <typename IndexType>
+struct TypedChunkLocation {
   /// \brief Index of the chunk in the array of chunks
   ///
   /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used
   /// to represent out-of-bounds locations.
-  int64_t chunk_index = 0;
+  IndexType chunk_index = 0;

   /// \brief Index of the value in the chunk
   ///
   /// The value is UNDEFINED if chunk_index >= chunks.size()
-  int64_t index_in_chunk = 0;
+  IndexType index_in_chunk = 0;

-  ChunkLocation() = default;
+  TypedChunkLocation() = default;

-  ChunkLocation(int64_t chunk_index, int64_t index_in_chunk)
-      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {}
+  TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk)
+      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {
+    static_assert(sizeof(TypedChunkLocation) == 2 * sizeof(IndexType));
+    static_assert(alignof(TypedChunkLocation) == alignof(IndexType));
+  }

-  bool operator==(ChunkLocation other) const {
+  bool operator==(TypedChunkLocation other) const {
     return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk;
   }
 };

+using ChunkLocation = TypedChunkLocation<int64_t>;
+
 /// \brief An utility that incrementally resolves logical indices into
 /// physical indices in a chunked array.
 struct ARROW_EXPORT ChunkResolver {
@@ -144,26 +150,25 @@ struct ARROW_EXPORT ChunkResolver {
   ///
   /// \pre 0 <= logical_index_vec[i] < logical_array_length()
   ///      (for well-defined and valid chunk index results)
-  /// \pre out_chunk_index_vec has space for `n_indices`
+  /// \pre out_chunk_location_vec has space for `n_indices` locations
   /// \pre chunk_hint in [0, chunks.size()]
-  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n)
   /// \post if logical_index_vec[i] >= chunked_array.length(), then
-  ///       out_chunk_index_vec[i] == chunks.size()
-  ///       and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds)
-  /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and
-  ///       out_index_in_chunk_vec[i] are UNDEFINED
+  ///       out_chunk_location_vec[i].chunk_index == chunks.size()
+  ///       and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be
+  ///       out-of-bounds)
+  /// \post if logical_index_vec[i] < 0, then both values in
+  ///       out_chunk_location_vec[i] are UNDEFINED
   ///
   /// \param n_indices The number of logical indices to resolve
   /// \param logical_index_vec The logical indices to resolve
-  /// \param out_chunk_index_vec The output array where the chunk indices will be written
+  /// \param out_chunk_location_vec The output array where the locations will be written
   /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
-  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
-  ///        within-chunk indices will be written
   /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
   template <typename IndexType>
   [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
-                                 IndexType* out_chunk_index_vec, IndexType chunk_hint = 0,
-                                 IndexType* out_index_in_chunk_vec = NULLPTR) const {
+                                 TypedChunkLocation<IndexType>* out_chunk_location_vec,
+                                 IndexType chunk_hint = 0) const {
     if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
       // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()).
       constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
@@ -188,13 +193,11 @@ struct ARROW_EXPORT ChunkResolver {
       // logical index in the chunked array.
       using U = std::make_unsigned_t<IndexType>;
       ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
-                      reinterpret_cast<U*>(out_chunk_index_vec),
-                      static_cast<U>(chunk_hint),
-                      reinterpret_cast<U*>(out_index_in_chunk_vec));
+                      reinterpret_cast<TypedChunkLocation<U>*>(out_chunk_location_vec),
+                      static_cast<U>(chunk_hint));
     } else {
       static_assert(std::is_unsigned_v<IndexType>);
-      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint,
-                      out_index_in_chunk_vec);
+      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec, chunk_hint);
     }
     return true;
   }
@@ -226,10 +229,14 @@ struct ARROW_EXPORT ChunkResolver {
   /// \pre all the pre-conditions of ChunkResolver::ResolveMany()
   /// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
-  void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const;
-  void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const;
-  void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const;
-  void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const;
+  void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation<uint8_t>*,
+                       uint8_t) const;
+  void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation<uint16_t>*,
+                       uint16_t) const;
+  void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation<uint32_t>*,
+                       uint32_t) const;
+  void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation<uint64_t>*,
+                       uint64_t) const;

  public:
   /// \brief Find the index of the chunk that contains the logical index.
diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
index c36b736d5d5df..dd6aa51534fcb 100644
--- a/cpp/src/arrow/chunked_array.cc
+++ b/cpp/src/arrow/chunked_array.cc
@@ -27,6 +27,7 @@
 #include "arrow/array/array_nested.h"
 #include "arrow/array/util.h"
 #include "arrow/array/validate.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/pretty_print.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
@@ -86,6 +87,18 @@ Result<std::shared_ptr<ChunkedArray>> ChunkedArray::MakeEmpty(
   return std::make_shared<ChunkedArray>(std::move(new_chunks));
 }

+DeviceAllocationTypeSet ChunkedArray::device_types() const {
+  if (chunks_.empty()) {
+    // An empty ChunkedArray is considered to be CPU-only.
+    return DeviceAllocationTypeSet::CpuOnly();
+  }
+  DeviceAllocationTypeSet set;
+  for (const auto& chunk : chunks_) {
+    set.add(chunk->device_type());
+  }
+  return set;
+}
+
 bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) const {
   if (length_ != other.length()) {
     return false;
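The new accessor makes it cheap to guard CPU-only code paths over chunked data. A short usage sketch (for CPU-built arrays this always passes; with device memory, e.g. CUDA, is_cpu() would be false and raw-value access would be invalid):

#include <iostream>
#include "arrow/api.h"

void ProcessIfCpu(const arrow::ChunkedArray& chunked) {
  if (!chunked.is_cpu()) {
    std::cout << "non-CPU chunks present; copy to host first\n";
    return;
  }
  for (const auto& chunk : chunked.chunks()) {
    std::cout << chunk->ToString() << "\n";  // safe: CPU-accessible memory
  }
}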
diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h
index 5d300861d85c2..c65b6cb6e227f 100644
--- a/cpp/src/arrow/chunked_array.h
+++ b/cpp/src/arrow/chunked_array.h
@@ -25,6 +25,7 @@

 #include "arrow/chunk_resolver.h"
 #include "arrow/compare.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/type_fwd.h"
@@ -116,6 +117,13 @@ class ARROW_EXPORT ChunkedArray {
   /// \return an ArrayVector of chunks
   const ArrayVector& chunks() const { return chunks_; }

+  /// \return The set of device allocation types used by the chunks in this
+  ///         chunked array.
+  DeviceAllocationTypeSet device_types() const;
+
+  /// \return true if all chunks are allocated on CPU-accessible memory.
+  bool is_cpu() const { return device_types().is_cpu_only(); }
+
   /// \brief Construct a zero-copy slice of the chunked array with the
   /// indicated offset and length
   ///
diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc
index e9cc283b53cd5..bf9d4af7c7bb0 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -37,6 +37,7 @@ namespace arrow {

 using internal::ChunkLocation;
 using internal::ChunkResolver;
+using internal::TypedChunkLocation;

 class TestChunkedArray : public ::testing::Test {
  protected:
@@ -61,12 +62,17 @@ TEST_F(TestChunkedArray, Make) {
                        ChunkedArray::Make({}, int64()));
   AssertTypeEqual(*int64(), *result->type());
   ASSERT_EQ(result->num_chunks(), 0);
+  // Empty chunked arrays are treated as CPU-allocated.
+  ASSERT_TRUE(result->is_cpu());

   auto chunk0 = ArrayFromJSON(int8(), "[0, 1, 2]");
   auto chunk1 = ArrayFromJSON(int16(), "[3, 4, 5]");

   ASSERT_OK_AND_ASSIGN(result, ChunkedArray::Make({chunk0, chunk0}));
   ASSERT_OK_AND_ASSIGN(auto result2, ChunkedArray::Make({chunk0, chunk0}, int8()));
+  // All chunks are CPU-accessible.
+  ASSERT_TRUE(result->is_cpu());
+  ASSERT_TRUE(result2->is_cpu());
   AssertChunkedEqual(*result, *result2);

   ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0, chunk1}));
@@ -375,24 +381,26 @@ class TestChunkResolverMany : public ::testing::Test {
   Result<std::vector<ChunkLocation>> ResolveMany(
       const ChunkResolver& resolver, const std::vector<IndexType>& logical_index_vec) {
     const size_t n = logical_index_vec.size();
-    std::vector<IndexType> chunk_index_vec;
-    chunk_index_vec.resize(n);
-    std::vector<IndexType> index_in_chunk_vec;
-    index_in_chunk_vec.resize(n);
+    std::vector<TypedChunkLocation<IndexType>> chunk_location_vec;
+    chunk_location_vec.resize(n);
     bool valid = resolver.ResolveMany(
-        static_cast<int64_t>(n), logical_index_vec.data(), chunk_index_vec.data(), 0,
-        index_in_chunk_vec.data());
+        static_cast<int64_t>(n), logical_index_vec.data(), chunk_location_vec.data(), 0);
     if (ARROW_PREDICT_FALSE(!valid)) {
       return Status::Invalid("index type doesn't fit possible chunk indexes");
     }
-    std::vector<ChunkLocation> locations;
-    locations.reserve(n);
-    for (size_t i = 0; i < n; i++) {
-      auto chunk_index = static_cast<int64_t>(chunk_index_vec[i]);
-      auto index_in_chunk = static_cast<int64_t>(index_in_chunk_vec[i]);
-      locations.emplace_back(chunk_index, index_in_chunk);
+    if constexpr (std::is_same<IndexType, int64_t>::value) {
+      return chunk_location_vec;
+    } else {
+      std::vector<ChunkLocation> locations;
+      locations.reserve(n);
+      for (size_t i = 0; i < n; i++) {
+        auto loc = chunk_location_vec[i];
+        auto chunk_index = static_cast<int64_t>(loc.chunk_index);
+        auto index_in_chunk = static_cast<int64_t>(loc.index_in_chunk);
+        locations.emplace_back(chunk_index, index_in_chunk);
+      }
+      return locations;
     }
-    return locations;
   }

   void CheckResolveMany(const ChunkResolver& resolver,
This is ok for the + /// predicate pushdown use case because the overall aim is to simplify to an + /// unsatisfiable expression. + /// + /// \pre `is_in_call` is a call to the `is_in` function + /// \return a simplified expression, or nullopt if no simplification occurred + static Result> SimplifyIsIn( + const Inequality& guarantee, const Expression::Call* is_in_call) { + DCHECK_EQ(is_in_call->function_name, "is_in"); + + auto options = checked_pointer_cast(is_in_call->options); + + const auto& lhs = Comparison::StripOrderPreservingCasts(is_in_call->arguments[0]); + if (!lhs.field_ref()) return std::nullopt; + if (*lhs.field_ref() != guarantee.target) return std::nullopt; + + FilterOptions::NullSelectionBehavior null_selection; + switch (options->null_matching_behavior) { + case SetLookupOptions::MATCH: + null_selection = + guarantee.nullable ? FilterOptions::EMIT_NULL : FilterOptions::DROP; + break; + case SetLookupOptions::SKIP: + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::EMIT_NULL: + if (guarantee.nullable) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + case SetLookupOptions::INCONCLUSIVE: + if (guarantee.nullable) return std::nullopt; + ARROW_ASSIGN_OR_RAISE(Datum is_null, IsNull(options->value_set)); + ARROW_ASSIGN_OR_RAISE(Datum any_null, Any(is_null)); + if (any_null.scalar_as().value) return std::nullopt; + null_selection = FilterOptions::DROP; + break; + } + + std::string func_name = Comparison::GetName(guarantee.cmp); + DCHECK_NE(func_name, "na"); + std::vector args{options->value_set, guarantee.bound}; + ARROW_ASSIGN_OR_RAISE(Datum filter_mask, CallFunction(func_name, args)); + FilterOptions filter_options(null_selection); + ARROW_ASSIGN_OR_RAISE(Datum simplified_value_set, + Filter(options->value_set, filter_mask, filter_options)); + + if (simplified_value_set.length() == 0) return literal(false); + if (simplified_value_set.length() == options->value_set.length()) return std::nullopt; + + ExecContext exec_context; + Expression::Call simplified_call; + simplified_call.function_name = "is_in"; + simplified_call.arguments = is_in_call->arguments; + simplified_call.options = std::make_shared( + simplified_value_set, options->null_matching_behavior); + ARROW_ASSIGN_OR_RAISE( + Expression simplified_expr, + BindNonRecursive(std::move(simplified_call), + /*insert_implicit_casts=*/false, &exec_context)); + return simplified_expr; + } + /// \brief Simplify the given expression given this inequality as a guarantee. Result Simplify(Expression expr) { const auto& guarantee = *this; @@ -1258,6 +1325,12 @@ struct Inequality { return call->function_name == "is_valid" ? 
diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc
index d94a17b6ffadf..0b7e8a9c23b13 100644
--- a/cpp/src/arrow/compute/expression_test.cc
+++ b/cpp/src/arrow/compute/expression_test.cc
@@ -27,6 +27,7 @@
 #include
 #include

+#include "arrow/array/builder_primitive.h"
 #include "arrow/compute/expression_internal.h"
 #include "arrow/compute/function_internal.h"
 #include "arrow/compute/registry.h"
@@ -1616,6 +1617,144 @@ TEST(Expression, SimplifyWithComparisonAndNullableCaveat) {
                        true_unless_null(field_ref("i32"))));  // not satisfiable, will drop row group
 }

+TEST(Expression, SimplifyIsIn) {
+  auto is_in = [](Expression field, std::shared_ptr<DataType> value_set_type,
+                  std::string json_array,
+                  SetLookupOptions::NullMatchingBehavior null_matching_behavior) {
+    SetLookupOptions options{ArrayFromJSON(value_set_type, json_array),
+                             null_matching_behavior};
+    return call("is_in", {field}, options);
+  };
+
+  for (SetLookupOptions::NullMatchingBehavior null_matching : {
+           SetLookupOptions::MATCH,
+           SetLookupOptions::SKIP,
+           SetLookupOptions::EMIT_NULL,
+           SetLookupOptions::INCONCLUSIVE,
+       }) {
+    Simplify{is_in(field_ref("i32"), int32(), "[]", null_matching)}
+        .WithGuarantee(greater(field_ref("i32"), literal(2)))
+        .Expect(false);
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(equal(field_ref("i32"), literal(6)))
+        .Expect(false);
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(greater(field_ref("i32"), literal(3)))
+        .Expect(is_in(field_ref("i32"), int32(), "[5,7,9]", null_matching));
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(greater(field_ref("i32"), literal(9)))
+        .Expect(false);
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(less_equal(field_ref("i32"), literal(0)))
+        .Expect(false);
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(greater(field_ref("i32"), literal(0)))
+        .ExpectUnchanged();
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(less_equal(field_ref("i32"), literal(9)))
+        .ExpectUnchanged();
+
+    Simplify{is_in(field_ref("i32"), int32(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(and_(less_equal(field_ref("i32"), literal(7)),
+                            greater(field_ref("i32"), literal(4))))
+        .Expect(is_in(field_ref("i32"), int32(), "[5,7]", null_matching));
+
+    Simplify{is_in(field_ref("u32"), int8(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(greater(field_ref("u32"), literal(3)))
+        .Expect(is_in(field_ref("u32"), int8(), "[5,7,9]", null_matching));
+
+    Simplify{is_in(field_ref("u32"), int64(), "[1,3,5,7,9]", null_matching)}
+        .WithGuarantee(greater(field_ref("u32"), literal(3)))
+        .Expect(is_in(field_ref("u32"), int64(), "[5,7,9]", null_matching));
+  }
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH),
+  }
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::MATCH),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH),
+  }
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::MATCH));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::MATCH),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .Expect(is_in(field_ref("i32"), int32(), "[3,null]", SetLookupOptions::MATCH));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::SKIP),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP),
+  }
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::SKIP),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::SKIP));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::EMIT_NULL),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .ExpectUnchanged();
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL),
+  }
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .Expect(is_in(field_ref("i32"), int32(), "[3]", SetLookupOptions::EMIT_NULL));
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::EMIT_NULL),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .ExpectUnchanged();
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3]", SetLookupOptions::INCONCLUSIVE),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .ExpectUnchanged();
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE),
+  }
+      .WithGuarantee(greater(field_ref("i32"), literal(2)))
+      .ExpectUnchanged();
+
+  Simplify{
+      is_in(field_ref("i32"), int32(), "[1,2,3,null]", SetLookupOptions::INCONCLUSIVE),
+  }
+      .WithGuarantee(
+          or_(greater(field_ref("i32"), literal(2)), is_null(field_ref("i32"))))
+      .ExpectUnchanged();
+}
+
 TEST(Expression, SimplifyThenExecute) {
   auto filter =
       or_({equal(field_ref("f32"), literal(0)),
@@ -1643,6 +1782,40 @@ TEST(Expression, SimplifyThenExecute) {
   AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true);
 }
 
+TEST(Expression, SimplifyIsInThenExecute) {
+  auto input = RecordBatchFromJSON(kBoringSchema, R"([
+      {"i64": 2, "i32": 5},
+      {"i64": 5, "i32": 6},
+      {"i64": 3, "i32": 6},
+      {"i64": 3, "i32": 5},
+      {"i64": 4, "i32": 5},
+      {"i64": 2, "i32": 7},
+      {"i64": 5, "i32": 5}
+  ])");
+
+  std::vector<Expression> guarantees{greater(field_ref("i64"), literal(1)),
+                                     greater_equal(field_ref("i32"), literal(5)),
+                                     less_equal(field_ref("i64"), literal(5))};
+
+  for (const Expression& guarantee : guarantees) {
+    auto filter =
+        call("is_in", {guarantee.call()->arguments[0]},
+             compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2,3]"), true});
+    ASSERT_OK_AND_ASSIGN(filter, filter.Bind(*kBoringSchema));
+    ASSERT_OK_AND_ASSIGN(auto simplified, SimplifyWithGuarantee(filter, guarantee));
+
+    Datum evaluated, simplified_evaluated;
+    ExpectExecute(filter, input, &evaluated);
+    ExpectExecute(simplified, input, &simplified_evaluated);
+    if (simplified_evaluated.is_scalar()) {
+      ASSERT_OK_AND_ASSIGN(
+          simplified_evaluated,
+          MakeArrayFromScalar(*simplified_evaluated.scalar(), evaluated.length()));
+    }
+    AssertDatumsEqual(evaluated, simplified_evaluated, /*verbose=*/true);
+  }
+}
+
 TEST(Expression, Filter) {
   auto ExpectFilter = [](Expression filter, std::string batch_json) {
     ASSERT_OK_AND_ASSIGN(auto s, kBoringSchema->AddField(0, field("in", boolean())));
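End to end, the behavior those tests pin down looks like this from a caller's perspective. A hedged sketch, assuming a schema with an int32 field "i32"; arrow::ArrayFromJSON comes from Arrow's testing headers and is used here only for brevity, and RunExample is a made-up name:

// Sketch: how predicate pushdown benefits from the new is_in simplification.
#include <iostream>

#include <arrow/api.h>
#include <arrow/compute/api.h>
#include <arrow/compute/expression.h>
#include <arrow/testing/gtest_util.h>  // arrow::ArrayFromJSON (test utility)

namespace cp = arrow::compute;

arrow::Status RunExample() {
  auto schema = arrow::schema({arrow::field("i32", arrow::int32())});

  // Filter: i32 IN (1, 3, 5, 7, 9)
  cp::Expression filter = cp::call(
      "is_in", {cp::field_ref("i32")},
      cp::SetLookupOptions{arrow::ArrayFromJSON(arrow::int32(), "[1,3,5,7,9]"),
                           cp::SetLookupOptions::MATCH});
  ARROW_ASSIGN_OR_RAISE(filter, filter.Bind(*schema));

  // Guarantee from fragment statistics: every row satisfies i32 > 3.
  cp::Expression guarantee = cp::greater(cp::field_ref("i32"), cp::literal(3));

  // The value set shrinks to [5, 7, 9]; with a guarantee of i32 > 9 the whole
  // filter would collapse to literal(false) and the fragment could be skipped.
  ARROW_ASSIGN_OR_RAISE(cp::Expression simplified,
                        cp::SimplifyWithGuarantee(filter, guarantee));
  std::cout << simplified.ToString() << std::endl;
  return arrow::Status::OK();
}

int main() { return RunExample().ok() ? 0 : 1; }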
diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc
index e1a2e8c5d8879..0478a3d1e801a 100644
--- a/cpp/src/arrow/compute/function.cc
+++ b/cpp/src/arrow/compute/function.cc
@@ -30,6 +30,7 @@
 #include "arrow/compute/kernels/common_internal.h"
 #include "arrow/compute/registry.h"
 #include "arrow/datum.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/tracing_internal.h"
diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc
index 5c87ef2cd0561..5e7461cc52d0e 100644
--- a/cpp/src/arrow/compute/kernel.cc
+++ b/cpp/src/arrow/compute/kernel.cc
@@ -24,6 +24,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/compute/exec.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/result.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
index 1adb3e96c97c8..cfb6265f12904 100644
--- a/cpp/src/arrow/compute/kernel.h
+++ b/cpp/src/arrow/compute/kernel.h
@@ -31,6 +31,7 @@
 #include "arrow/buffer.h"
 #include "arrow/compute/exec.h"
 #include "arrow/datum.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
@@ -41,7 +42,7 @@
 // macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
 // No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
 #if defined(__APPLE__) && defined(PREALLOCATE)
-#undef PREALLOCATE
+#  undef PREALLOCATE
 #endif
 
 namespace arrow {
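Besides the kernel renames, the aggregate_basic.cc hunks below change plain assignment to `|=`/`&=` in BooleanAnyImpl::Consume and BooleanAllImpl::Consume, because Consume runs once per batch and assignment would discard state from earlier batches (the `* batch.length` change likewise counts a broadcast scalar once per row). A toy model of the assignment bug, with made-up names, not Arrow code:

// Why `|=` matters when Consume is called once per batch.
#include <cassert>

struct AnyState {
  bool has_nulls = false;
  bool any = false;

  void Consume(bool batch_has_nulls, bool batch_any) {
    has_nulls |= batch_has_nulls;  // plain `=` would forget earlier batches
    any |= batch_any;
  }
};

int main() {
  AnyState state;
  state.Consume(/*batch_has_nulls=*/true, /*batch_any=*/true);    // batch 1
  state.Consume(/*batch_has_nulls=*/false, /*batch_any=*/false);  // batch 2
  // With plain assignment the second batch would reset both flags to false.
  assert(state.has_nulls && state.any);
  return 0;
}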
+#include "arrow/compute/kernels/aggregate_basic.inc.cc" // NOLINT(build/include) namespace arrow { namespace compute { @@ -276,11 +278,6 @@ struct SumImplDefault : public SumImpl { using SumImpl::SumImpl; }; -template -struct MeanImplDefault : public MeanImpl { - using MeanImpl::MeanImpl; -}; - Result> SumInit(KernelContext* ctx, const KernelInitArgs& args) { SumLikeInit visitor( @@ -289,6 +286,14 @@ Result> SumInit(KernelContext* ctx, return visitor.Create(); } +// ---------------------------------------------------------------------- +// Mean implementation + +template +struct MeanImplDefault : public MeanImpl { + using MeanImpl::MeanImpl; +}; + Result> MeanInit(KernelContext* ctx, const KernelInitArgs& args) { MeanKernelInit visitor( @@ -482,8 +487,8 @@ void AddFirstOrLastAggKernel(ScalarAggregateFunction* func, // ---------------------------------------------------------------------- // MinMax implementation -Result> MinMaxInit(KernelContext* ctx, - const KernelInitArgs& args) { +Result> MinMaxInitDefault(KernelContext* ctx, + const KernelInitArgs& args) { ARROW_ASSIGN_OR_RAISE(TypeHolder out_type, args.kernel->signature->out_type().Resolve(ctx, args.inputs)); MinMaxInitState visitor( @@ -532,13 +537,13 @@ struct BooleanAnyImpl : public ScalarAggregator { } if (batch[0].is_scalar()) { const Scalar& scalar = *batch[0].scalar; - this->has_nulls = !scalar.is_valid; - this->any = scalar.is_valid && checked_cast(scalar).value; - this->count += scalar.is_valid; + this->has_nulls |= !scalar.is_valid; + this->any |= scalar.is_valid && checked_cast(scalar).value; + this->count += scalar.is_valid * batch.length; return Status::OK(); } const ArraySpan& data = batch[0].array; - this->has_nulls = data.GetNullCount() > 0; + this->has_nulls |= data.GetNullCount() > 0; this->count += data.length - data.GetNullCount(); arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[0].data, data.offset, data.buffers[1].data, data.offset, @@ -603,13 +608,13 @@ struct BooleanAllImpl : public ScalarAggregator { } if (batch[0].is_scalar()) { const Scalar& scalar = *batch[0].scalar; - this->has_nulls = !scalar.is_valid; - this->count += scalar.is_valid; - this->all = !scalar.is_valid || checked_cast(scalar).value; + this->has_nulls |= !scalar.is_valid; + this->count += scalar.is_valid * batch.length; + this->all &= !scalar.is_valid || checked_cast(scalar).value; return Status::OK(); } const ArraySpan& data = batch[0].array; - this->has_nulls = data.GetNullCount() > 0; + this->has_nulls |= data.GetNullCount() > 0; this->count += data.length - data.GetNullCount(); arrow::internal::OptionalBinaryBitBlockCounter counter( data.buffers[1].data, data.offset, data.buffers[0].data, data.offset, @@ -1114,14 +1119,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { // Add min max function func = std::make_shared("min_max", Arity::Unary(), min_max_doc, &default_scalar_aggregate_options); - AddMinMaxKernels(MinMaxInit, {null(), boolean()}, func.get()); - AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); - AddMinMaxKernels(MinMaxInit, TemporalTypes(), func.get()); - AddMinMaxKernels(MinMaxInit, BaseBinaryTypes(), func.get()); - AddMinMaxKernel(MinMaxInit, Type::FIXED_SIZE_BINARY, func.get()); - AddMinMaxKernel(MinMaxInit, Type::INTERVAL_MONTHS, func.get()); - AddMinMaxKernel(MinMaxInit, Type::DECIMAL128, func.get()); - AddMinMaxKernel(MinMaxInit, Type::DECIMAL256, func.get()); + AddMinMaxKernels(MinMaxInitDefault, {null(), boolean()}, func.get()); + AddMinMaxKernels(MinMaxInitDefault, 
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
new file mode 100644
index 0000000000000..f2151e0a9e029
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
@@ -0,0 +1,1025 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// .inc.cc file to be included in compilation unit where kernels are meant to be
+// compiled auto-vectorized by the compiler with different SIMD levels passed
+// as compiler flags.
+//
+// It contains no includes to avoid double inclusion in the compilation unit
+// that includes this .inc.cc file.
+
+#include
+#include
+#include
+#include
+#include
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/align_util.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/decimal.h"
+
+namespace arrow::compute::internal {
+namespace {
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+template <typename ArrowType, SimdLevel::type SimdLevel,
+          typename ResultType = typename FindAccumulatorType<ArrowType>::Type>
+struct SumImpl : public ScalarAggregator {
+  using ThisType = SumImpl<ArrowType, SimdLevel, ResultType>;
+  using CType = typename TypeTraits<ArrowType>::CType;
+  using SumType = ResultType;
+  using SumCType = typename TypeTraits<SumType>::CType;
+  using OutputType = typename TypeTraits<SumType>::ScalarType;
+
+  SumImpl(std::shared_ptr<DataType> out_type, ScalarAggregateOptions options_)
+      : out_type(std::move(out_type)), options(std::move(options_)) {}
+
+  Status Consume(KernelContext*, const ExecSpan& batch) override {
+    if (batch[0].is_array()) {
+      const ArraySpan& data = batch[0].array;
+      this->count += data.length - data.GetNullCount();
+      this->nulls_observed = this->nulls_observed || data.GetNullCount();
+
+      if (!options.skip_nulls && this->nulls_observed) {
+        // Short-circuit
+        return Status::OK();
+      }
+
+      if (is_boolean_type<ArrowType>::value) {
+        this->sum += GetTrueCount(data);
+      } else {
+        this->sum += SumArray<CType, SumCType, SimdLevel>(data);
+      }
+    } else {
+      const Scalar& data = *batch[0].scalar;
+      this->count += data.is_valid * batch.length;
+      this->nulls_observed = this->nulls_observed || !data.is_valid;
+      if (data.is_valid) {
+        this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
+      }
+    }
+    return Status::OK();
+  }
+
+  Status MergeFrom(KernelContext*, KernelState&& src) override {
+    const auto& other = checked_cast<const ThisType&>(src);
+    this->count += other.count;
+    this->sum += other.sum;
+    this->nulls_observed = this->nulls_observed || other.nulls_observed;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext*, Datum* out) override {
+    if ((!options.skip_nulls && this->nulls_observed) ||
+        (this->count < options.min_count)) {
+      out->value = std::make_shared<OutputType>(out_type);
+    } else {
+      out->value = std::make_shared<OutputType>(this->sum, out_type);
+    }
+    return Status::OK();
+  }
+
+  size_t count = 0;
+  bool nulls_observed = false;
+  SumCType sum = 0;
+  std::shared_ptr<DataType> out_type;
+  ScalarAggregateOptions options;
+};
+
+template <typename ArrowType>
+struct NullImpl : public ScalarAggregator {
+  using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+  explicit NullImpl(const ScalarAggregateOptions& options_) : options(options_) {}
+
+  Status Consume(KernelContext*, const ExecSpan& batch) override {
+    if (batch[0].is_scalar() || batch[0].array.GetNullCount() > 0) {
+      // If the batch is a scalar or an array with elements, set is_empty to false
+      is_empty = false;
+    }
+    return Status::OK();
+  }
+
+  Status MergeFrom(KernelContext*, KernelState&& src) override {
+    const auto& other = checked_cast<const NullImpl&>(src);
+    this->is_empty &= other.is_empty;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext*, Datum* out) override {
+    if ((options.skip_nulls || this->is_empty) && options.min_count == 0) {
+      // Return 0 if the remaining data is empty
+      out->value = output_empty();
+    } else {
+      out->value = MakeNullScalar(TypeTraits<ArrowType>::type_singleton());
+    }
+    return Status::OK();
+  }
+
+  virtual std::shared_ptr<Scalar> output_empty() = 0;
+
+  bool is_empty = true;
+  ScalarAggregateOptions options;
+};
+
+template <typename ArrowType>
+struct NullSumImpl : public NullImpl<ArrowType> {
+  using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+  explicit NullSumImpl(const ScalarAggregateOptions& options_)
+      : NullImpl<ArrowType>(options_) {}
+
+  std::shared_ptr<Scalar> output_empty() override {
+    return std::make_shared<ScalarType>(0);
+  }
+};
+
+template