diff --git a/.clang-format b/.clang-format index 9448dc8d8c80d..abd823c103904 100644 --- a/.clang-format +++ b/.clang-format @@ -19,3 +19,4 @@ BasedOnStyle: Google ColumnLimit: 90 DerivePointerAlignment: false IncludeBlocks: Preserve +IndentPPDirectives: AfterHash diff --git a/.dockerignore b/.dockerignore index 3791cca95e3fe..1f1715d8e833d 100644 --- a/.dockerignore +++ b/.dockerignore @@ -27,11 +27,11 @@ # include explicitly !ci/** !c_glib/Gemfile -!dev/archery/setup.py !dev/release/setup-*.sh !docs/requirements*.txt +!go/go.mod +!go/go.sum !python/requirements*.txt -!python/manylinux1/** !r/DESCRIPTION !ruby/Gemfile !ruby/red-arrow/Gemfile @@ -46,20 +46,3 @@ !ruby/red-parquet/Gemfile !ruby/red-parquet/lib/parquet/version.rb !ruby/red-parquet/red-parquet.gemspec -!ruby/red-plasma/Gemfile -!ruby/red-plasma/lib/plasma/version.rb -!ruby/red-plasma/red-plasma.gemspec -!rust/Cargo.toml -!rust/benchmarks/Cargo.toml -!rust/arrow/Cargo.toml -!rust/arrow/benches -!rust/arrow-flight/Cargo.toml -!rust/parquet/Cargo.toml -!rust/parquet/build.rs -!rust/parquet_derive/Cargo.toml -!rust/parquet_derive_test/Cargo.toml -!rust/datafusion/Cargo.toml -!rust/datafusion/benches -!rust/integration-testing/Cargo.toml -!go/go.mod -!go/go.sum \ No newline at end of file diff --git a/.env b/.env index af647fc8b7a7f..c8c236d5ac44b 100644 --- a/.env +++ b/.env @@ -71,6 +71,7 @@ NUMBA=latest NUMPY=latest PANDAS=latest PYTHON=3.8 +PYTHON_IMAGE_TAG=3.8 R=4.4 SPARK=master TURBODBC=latest diff --git a/.github/workflows/archery.yml b/.github/workflows/archery.yml index b016f7d11b9fa..e448209056d78 100644 --- a/.github/workflows/archery.yml +++ b/.github/workflows/archery.yml @@ -20,12 +20,14 @@ name: Archery & Crossbow on: push: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' - 'docker-compose.yml' pull_request: paths: + - '.dockerignore' - '.github/workflows/archery.yml' - 'dev/archery/**' - 'dev/tasks/**' @@ -58,7 +60,7 @@ jobs: shell: bash run: git branch $ARCHERY_DEFAULT_BRANCH origin/$ARCHERY_DEFAULT_BRANCH || true - name: Setup Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 with: python-version: '3.9' - name: Install pygit2 binary wheel diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml index 1138c0a02f812..b7af4c5800835 100644 --- a/.github/workflows/comment_bot.yml +++ b/.github/workflows/comment_bot.yml @@ -41,7 +41,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index fd23e0cf217e6..4a01d2f8e3aab 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -20,6 +20,7 @@ name: C++ on: push: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -35,6 +36,7 @@ on: - 'testing' pull_request: paths: + - '.dockerignore' - '.github/workflows/cpp.yml' - 'ci/conda_env_*' - 'ci/docker/**' @@ -243,7 +245,7 @@ jobs: $(brew --prefix bash)/bin/bash \ ci/scripts/install_minio.sh latest ${ARROW_HOME} - name: Set up Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 with: python-version: 3.12 - name: Install Google Cloud Storage Testbench @@ -409,12 +411,10 @@ jobs: ARROW_WITH_SNAPPY: ON 
ARROW_WITH_ZLIB: ON ARROW_WITH_ZSTD: ON - # Don't use preinstalled Boost by empty BOOST_ROOT and - # -DBoost_NO_BOOST_CMAKE=ON + # Don't use preinstalled Boost by empty BOOST_ROOT BOOST_ROOT: "" ARROW_CMAKE_ARGS: >- -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower}} - -DBoost_NO_BOOST_CMAKE=ON -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON # We can't use unity build because we don't have enough memory on # GitHub Actions. @@ -464,7 +464,7 @@ jobs: https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2022-05-26T05-48-41Z chmod +x /usr/local/bin/minio.exe - name: Set up Python - uses: actions/setup-python@v5.1.1 + uses: actions/setup-python@v5.2.0 id: python-install with: python-version: 3.9 @@ -472,7 +472,7 @@ jobs: shell: msys2 {0} env: PIPX_BIN_DIR: /usr/local/bin - PIPX_PYTHON: ${{ steps.python-install.outputs.python-path }} + PIPX_BASE_PYTHON: ${{ steps.python-install.outputs.python-path }} run: | ci/scripts/install_gcs_testbench.sh default - name: Test diff --git a/.github/workflows/csharp.yml b/.github/workflows/csharp.yml index 6e8548dc960f4..c618350affbeb 100644 --- a/.github/workflows/csharp.yml +++ b/.github/workflows/csharp.yml @@ -108,7 +108,7 @@ jobs: with: dotnet-version: ${{ matrix.dotnet }} - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Checkout Arrow diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index cc3ff6330746d..1cc8d993498b6 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -45,7 +45,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install pre-commit @@ -104,7 +104,7 @@ jobs: with: fetch-depth: 0 - name: Install Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: '3.12' - name: Install Ruby diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 25db1c39ad89e..1219f7526f9f2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -52,7 +52,7 @@ jobs: key: debian-docs-${{ hashFiles('cpp/**') }} restore-keys: debian-docs- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index ea7fe5d02d7b8..7d540b7cecdc9 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -20,6 +20,7 @@ name: Docs on: pull_request: paths: + - '.dockerignore' - 'docs/**' - '.github/workflows/docs_light.yml' - 'ci/docker/conda.dockerfile' @@ -58,7 +59,7 @@ jobs: key: conda-docs-${{ hashFiles('cpp/**') }} restore-keys: conda-docs- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Setup Archery diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index b9a19d182d5c4..d463549206471 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -20,6 
+20,7 @@ name: Go on: push: paths: + - '.dockerignore' - '.github/workflows/go.yml' - 'ci/docker/*_go.dockerfile' - 'ci/scripts/go_*' @@ -27,6 +28,7 @@ on: - 'go/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/go.yml' - 'ci/docker/*_go.dockerfile' - 'ci/docker/**' @@ -207,7 +209,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -247,7 +249,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -339,7 +341,7 @@ jobs: github.event_name == 'push' && github.repository == 'apache/arrow' && github.ref_name == 'main' - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: '3.10' - name: Run Benchmarks diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 43f8af0a600d8..ecf89bff8f600 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -20,6 +20,7 @@ name: Integration on: push: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -33,6 +34,7 @@ on: - 'format/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/integration.yml' - 'ci/**' - 'dev/archery/**' @@ -89,7 +91,7 @@ jobs: key: conda-${{ hashFiles('cpp/**') }} restore-keys: conda- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 0317879b580ba..57f834bcbabee 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -20,6 +20,7 @@ name: Java on: push: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java.yml' - 'ci/docker/*java*' - 'ci/scripts/java*.sh' @@ -76,7 +78,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index c2bc679e681a2..f2ecc801dc724 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -20,6 +20,7 @@ name: Java JNI on: push: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -29,6 +30,7 @@ on: - 'java/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/java_jni.yml' - 'ci/docker/**' - 'ci/scripts/cpp_build.sh' @@ -70,7 +72,7 @@ jobs: key: java-jni-manylinux-2014-${{ hashFiles('cpp/**', 'java/**') }} restore-keys: java-jni-manylinux-2014- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: 
actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -110,7 +112,7 @@ jobs: key: maven-${{ hashFiles('java/**') }} restore-keys: maven- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/java_nightly.yml b/.github/workflows/java_nightly.yml index 72afb6dbf1c1d..0bf0c27288faf 100644 --- a/.github/workflows/java_nightly.yml +++ b/.github/workflows/java_nightly.yml @@ -58,7 +58,7 @@ jobs: repository: ursacomputing/crossbow ref: main - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: cache: 'pip' python-version: 3.12 diff --git a/.github/workflows/js.yml b/.github/workflows/js.yml index 630bef61105f6..17b57c42b62f6 100644 --- a/.github/workflows/js.yml +++ b/.github/workflows/js.yml @@ -20,12 +20,14 @@ name: NodeJS on: push: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' - 'js/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/js.yml' - 'ci/docker/*js.dockerfile' - 'ci/scripts/js_*' @@ -54,7 +56,7 @@ jobs: with: fetch-depth: 0 - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery diff --git a/.github/workflows/pr_bot.yml b/.github/workflows/pr_bot.yml index 7dd06b6aeec09..bbb1a2d7228d0 100644 --- a/.github/workflows/pr_bot.yml +++ b/.github/workflows/pr_bot.yml @@ -82,7 +82,7 @@ jobs: # fetch the tags for version number generation fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.12 - name: Install Archery and Crossbow dependencies diff --git a/.github/workflows/pr_review_trigger.yml b/.github/workflows/pr_review_trigger.yml index 0cd89b3206715..68f922ce8b4d9 100644 --- a/.github/workflows/pr_review_trigger.yml +++ b/.github/workflows/pr_review_trigger.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Upload PR review Payload" - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4.4.0 with: path: "${{ github.event_path }}" name: "pr_review_payload" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 90d3a50af3705..6e83b727593b4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -20,6 +20,7 @@ name: Python on: push: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -27,6 +28,7 @@ on: - 'python/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/python.yml' - 'ci/**' - 'cpp/**' @@ -107,7 +109,7 @@ jobs: key: ${{ matrix.cache }}-${{ hashFiles('cpp/**') }} restore-keys: ${{ matrix.cache }}- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -177,7 +179,7 @@ jobs: fetch-depth: 0 submodules: recursive - name: Setup Python - uses: actions/setup-python@v5.1.1 + uses: 
actions/setup-python@v5.2.0
         with:
           python-version: '3.11'
       - name: Install Dependencies
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
index 2820d42470bca..fbc2ebe0bc5f1 100644
--- a/.github/workflows/r.yml
+++ b/.github/workflows/r.yml
@@ -20,6 +20,7 @@ name: R
 on:
   push:
     paths:
+      - '.dockerignore'
       - ".github/workflows/r.yml"
       - "ci/docker/**"
       - "ci/etc/rprofile"
@@ -32,6 +33,7 @@ on:
       - "r/**"
   pull_request:
     paths:
+      - '.dockerignore'
       - ".github/workflows/r.yml"
       - "ci/docker/**"
       - "ci/etc/rprofile"
@@ -146,7 +148,7 @@ jobs:
             ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-${{ hashFiles('cpp/src/**/*.cc','cpp/src/**/*.h') }}-
             ubuntu-${{ matrix.ubuntu }}-r-${{ matrix.r }}-
       - name: Setup Python
-        uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
           python-version: 3.8
       - name: Setup Archery
@@ -169,9 +171,9 @@ jobs:
         if: always()
       - name: Save the test output
         if: always()
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
-          name: test-output
+          name: test-output-${{ matrix.ubuntu }}-${{ matrix.r }}
           path: r/check/arrow.Rcheck/tests/testthat.Rout*
       - name: Docker Push
         if: >-
@@ -206,7 +208,7 @@ jobs:
           fetch-depth: 0
           submodules: recursive
       - name: Setup Python
-        uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
           python-version: 3.8
       - name: Setup Archery
@@ -230,9 +232,9 @@ jobs:
         if: always()
       - name: Save the test output
         if: always()
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
+        uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
-          name: test-output
+          name: test-output-bundled
           path: r/check/arrow.Rcheck/tests/testthat.Rout*
       - name: Docker Push
         if: >-
@@ -292,7 +294,7 @@ jobs:
         # So that they're unique when multiple are downloaded in the next step
         shell: bash
         run: mv libarrow.zip libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
         with:
           name: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
           path: libarrow-rtools${{ matrix.config.rtools }}-${{ matrix.config.arch }}.zip
@@ -330,7 +332,7 @@ jobs:
           echo "$HOME/.local/bin" >> $GITHUB_PATH
       - run: mkdir r/windows
       - name: Download artifacts
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4.1.7
         with:
           name: libarrow-rtools40-ucrt64.zip
           path: r/windows
diff --git a/.github/workflows/r_nightly.yml b/.github/workflows/r_nightly.yml
index 1ec071b6bbb5e..9817e41d3b61d 100644
--- a/.github/workflows/r_nightly.yml
+++ b/.github/workflows/r_nightly.yml
@@ -60,7 +60,7 @@ jobs:
           repository: ursacomputing/crossbow
           ref: main
       - name: Set up Python
-        uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
         with:
           cache: 'pip'
           python-version: 3.12
diff --git a/.github/workflows/ruby.yml b/.github/workflows/ruby.yml
index e4d650e74a8ad..c4a7f31f4a94c 100644
--- a/.github/workflows/ruby.yml
+++ b/.github/workflows/ruby.yml
@@ -20,6 +20,7 @@ name: C GLib & Ruby
 on:
   push:
     paths:
+      - '.dockerignore'
       - '.github/workflows/ruby.yml'
       - 'ci/docker/**'
       - 'ci/scripts/c_glib_*'
@@ -33,6 +34,7 @@
on: - 'ruby/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/ruby.yml' - 'ci/docker/**' - 'ci/scripts/c_glib_*' @@ -83,7 +85,7 @@ jobs: key: ubuntu-${{ matrix.ubuntu }}-ruby-${{ hashFiles('cpp/**') }} restore-keys: ubuntu-${{ matrix.ubuntu }}-ruby- - name: Setup Python - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: 3.8 - name: Setup Archery @@ -406,7 +408,10 @@ jobs: -source "https://nuget.pkg.github.com/$GITHUB_REPOSITORY_OWNER/index.json" - name: Build C++ vcpkg dependencies run: | - vcpkg\vcpkg.exe install --triplet $env:VCPKG_TRIPLET --x-manifest-root cpp --x-install-root build\cpp\vcpkg_installed + vcpkg\vcpkg.exe install ` + --triplet $env:VCPKG_TRIPLET ` + --x-manifest-root cpp ` + --x-install-root build\cpp\vcpkg_installed - name: Build C++ shell: cmd run: | diff --git a/.github/workflows/swift.yml b/.github/workflows/swift.yml index 1b3c9eca1814a..86eb113dfc833 100644 --- a/.github/workflows/swift.yml +++ b/.github/workflows/swift.yml @@ -20,6 +20,7 @@ name: Swift on: push: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' @@ -27,6 +28,7 @@ on: - 'swift/**' pull_request: paths: + - '.dockerignore' - '.github/workflows/swift.yml' - 'ci/docker/*swift*' - 'ci/scripts/swift_*' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf0bcde14622a..91017969eb502 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -78,6 +78,26 @@ repos: ?^cpp/src/generated/| ?^cpp/thirdparty/| ) + - repo: https://github.com/cpplint/cpplint + rev: 1.6.1 + hooks: + - id: cpplint + name: C++ Lint + args: + - "--verbose=2" + types_or: + - c++ + files: >- + ^cpp/ + exclude: >- + ( + ?\.grpc\.fb\.(cc|h)$| + ?\.pb\.(cc|h)$| + ?_generated.*\.(cc|h)$| + ?^cpp/src/arrow/vendored/| + ?^cpp/src/generated/| + ?^cpp/thirdparty/| + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v14.0.6 hooks: diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000000000..2f47b4dbf57b7 --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
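+#
+# Each "filter" line below subtracts one cpplint category; cpplint merges
+# every CPPLINT.cfg it finds while walking up from a checked file's
+# directory toward the repository root. A roughly equivalent one-off
+# invocation would be:
+#
+#   cpplint --linelength=90 --filter=-build/c++11,-whitespace/comments foo.cc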
+
+filter = -build/c++11
+filter = -build/header_guard
+filter = -build/include_order
+filter = -build/include_what_you_use
+filter = -readability/alt_tokens
+# readability/casting is disabled as it aggressively warns about
+# functions with names like "int32", so "int32(x)", where int32 is a
+# function name, warns with a false positive
+filter = -readability/casting
+filter = -readability/todo
+filter = -runtime/references
+filter = -whitespace/comments
+linelength = 90
diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile
index eb035d887a158..f0084894e19dc 100644
--- a/ci/docker/conda-cpp.dockerfile
+++ b/ci/docker/conda-cpp.dockerfile
@@ -44,7 +44,7 @@ RUN mamba install -q -y \
 
 # We want to install the GCS testbench using the Conda base environment's Python,
 # because the test environment's Python may later change.
-ENV PIPX_PYTHON=/opt/conda/bin/python3
+ENV PIPX_BASE_PYTHON=/opt/conda/bin/python3
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts
 RUN /arrow/ci/scripts/install_gcs_testbench.sh default
diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile
index 443ff9c53cbcb..09883f9780a36 100644
--- a/ci/docker/python-wheel-manylinux-test.dockerfile
+++ b/ci/docker/python-wheel-manylinux-test.dockerfile
@@ -19,13 +19,19 @@ ARG arch
 ARG python_image_tag
 FROM ${arch}/python:${python_image_tag}
 
-# RUN pip install --upgrade pip
-
 # pandas doesn't provide wheel for aarch64 yet, so cache the compiled
 # test dependencies in a docker image
 COPY python/requirements-wheel-test.txt /arrow/python/
 RUN pip install -r /arrow/python/requirements-wheel-test.txt
 
+# Install the GCS testbench with the system Python
+RUN apt-get update -y -q && \
+    apt-get install -y -q \
+        build-essential \
+        python3-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
 COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/
-ARG python
-RUN PYTHON_VERSION=${python} /arrow/ci/scripts/install_gcs_testbench.sh default
+ENV PIPX_PYTHON=/usr/bin/python3 PIPX_PIP_ARGS=--prefer-binary
+RUN /arrow/ci/scripts/install_gcs_testbench.sh default
diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile
index 42f088fd8a22a..5cc1711608c03 100644
--- a/ci/docker/python-wheel-manylinux.dockerfile
+++ b/ci/docker/python-wheel-manylinux.dockerfile
@@ -100,6 +100,9 @@ RUN vcpkg install \
         --x-feature=parquet \
         --x-feature=s3
 
+# Make sure auditwheel is up-to-date
+RUN pipx upgrade auditwheel
+
 # Configure Python for applications running in the bash shell of this Dockerfile
 ARG python=3.8
 ENV PYTHON_VERSION=${python}
diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh
index 78826e94d3294..48a5858a358c9 100755
--- a/ci/scripts/install_gcs_testbench.sh
+++ b/ci/scripts/install_gcs_testbench.sh
@@ -39,18 +39,21 @@ if [[ "${version}" -eq "default" ]]; then
   version="v0.39.0"
 fi
 
-: ${PIPX_PYTHON:=$(which python3)}
+# The Python to install pipx with
+: ${PIPX_BASE_PYTHON:=$(which python3)}
+# The Python to install the GCS testbench with
+: ${PIPX_PYTHON:=${PIPX_BASE_PYTHON:-$(which python3)}}
 
 export PIP_BREAK_SYSTEM_PACKAGES=1
-${PIPX_PYTHON} -m pip install -U pipx
+${PIPX_BASE_PYTHON} -m pip install -U pipx
 
-# This script is run with PYTHON undefined in some places,
-# but those only use older pythons.
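# With the array-based flags introduced below, a root invocation with
# PIPX_PIP_ARGS=--prefer-binary expands to roughly:
#
#   python3 -m pipx install --verbose --python /usr/bin/python3 --global \
#     --pip-args '--prefer-binary' \
#     https://github.com/googleapis/storage-testbench/archive/v0.39.0.tar.gz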
-if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then
-  pipx_flags=--verbose
-  if [[ $(id -un) == "root" ]]; then
-    # Install globally as /root/.local/bin is typically not in $PATH
-    pipx_flags="${pipx_flags} --global"
-  fi
-  ${PIPX_PYTHON} -m pipx install ${pipx_flags} "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
+pipx_flags=(--verbose --python "${PIPX_PYTHON}")
+if [[ $(id -un) == "root" ]]; then
+  # Install globally as /root/.local/bin is typically not in $PATH
+  pipx_flags+=(--global)
 fi
+if [[ -n "${PIPX_PIP_ARGS}" ]]; then
+  pipx_flags+=(--pip-args "'${PIPX_PIP_ARGS}'")
+fi
+${PIPX_BASE_PYTHON} -m pipx install "${pipx_flags[@]}" \
+  "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz"
diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh
index 92b962f1740bd..d2c392e6b9db3 100755
--- a/ci/scripts/python_wheel_macos_build.sh
+++ b/ci/scripts/python_wheel_macos_build.sh
@@ -150,7 +150,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ==="
 export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE}
 export PYARROW_BUNDLE_ARROW_CPP=1
 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR}
-export PYARROW_INSTALL_TESTS=1
 export PYARROW_WITH_ACERO=${ARROW_ACERO}
 export PYARROW_WITH_AZURE=${ARROW_AZURE}
 export PYARROW_WITH_DATASET=${ARROW_DATASET}
diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh
index aa86494a9d47d..885019ff3049f 100755
--- a/ci/scripts/python_wheel_manylinux_build.sh
+++ b/ci/scripts/python_wheel_manylinux_build.sh
@@ -140,7 +140,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ==="
 export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE}
 export PYARROW_BUNDLE_ARROW_CPP=1
 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR}
-export PYARROW_INSTALL_TESTS=1
 export PYARROW_WITH_ACERO=${ARROW_ACERO}
 export PYARROW_WITH_AZURE=${ARROW_AZURE}
 export PYARROW_WITH_DATASET=${ARROW_DATASET}
@@ -181,5 +180,5 @@ popd
 rm -rf dist/temp-fix-wheel
 
 echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ==="
-auditwheel repair -L . dist/pyarrow-*.whl -w repaired_wheels
+auditwheel repair dist/pyarrow-*.whl -w repaired_wheels
 popd
diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh
index cf87a17056783..6bdc3d3621e14 100755
--- a/ci/scripts/python_wheel_unix_test.sh
+++ b/ci/scripts/python_wheel_unix_test.sh
@@ -34,6 +34,7 @@ source_dir=${1}
 : ${ARROW_S3:=ON}
 : ${ARROW_SUBSTRAIT:=ON}
 : ${CHECK_IMPORTS:=ON}
+: ${CHECK_WHEEL_CONTENT:=ON}
 : ${CHECK_UNITTESTS:=ON}
 : ${INSTALL_PYARROW:=ON}
 
@@ -87,6 +88,11 @@ import pyarrow.parquet
   fi
 fi
 
+if [ "${CHECK_WHEEL_CONTENT}" == "ON" ]; then
+  python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \
+    --path ${source_dir}/python/repaired_wheels
+fi
+
 if [ "${CHECK_UNITTESTS}" == "ON" ]; then
   # Install testing dependencies
   pip install -U -r ${source_dir}/python/requirements-wheel-test.txt
diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py
new file mode 100644
index 0000000000000..22b3a890f036b
--- /dev/null
+++ b/ci/scripts/python_wheel_validate_contents.py
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +from pathlib import Path +import re +import zipfile + + +def validate_wheel(path): + p = Path(path) + wheels = list(p.glob('*.whl')) + error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" + assert len(wheels) == 1, error_msg + f = zipfile.ZipFile(wheels[0]) + outliers = [ + info.filename for info in f.filelist if not re.match( + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/)', info.filename + ) + ] + assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + print(f"The wheel: {wheels[0]} seems valid.") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--path", type=str, required=True, + help="Directory where wheel is located") + args = parser.parse_args() + validate_wheel(args.path) + + +if __name__ == '__main__': + main() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 54f02ec6f6ed0..1f1d5dca721d9 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -106,7 +106,6 @@ echo "=== (%PYTHON_VERSION%) Building wheel ===" set PYARROW_BUILD_TYPE=%CMAKE_BUILD_TYPE% set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% -set PYARROW_INSTALL_TESTS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index cac3f18434b6c..de5a2c2e965cb 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -64,6 +64,9 @@ set PYTHON_CMD=py -%PYTHON% %PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 %PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 +@REM Validate wheel contents +%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\dist || exit /B 1 + @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B mkdir %USERPROFILE%\Downloads\test\tzdata diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5ead9e4b063cd..423744c388471 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -301,7 +301,8 @@ add_custom_target(lint --cpplint_binary ${CPPLINT_BIN} ${COMMON_LINT_OPTIONS} - ${ARROW_LINT_QUIET}) + ${ARROW_LINT_QUIET} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) 
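# cpplint resolves CPPLINT.cfg by walking up from each checked file, so the
# lint target above runs one level up (the repository root) to pick up the
# new top-level CPPLINT.cfg alongside the per-run command-line options.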
# # "make format" and "make check-format" targets diff --git a/cpp/build-support/run_cpplint.py b/cpp/build-support/run_cpplint.py index 76c0fe0aefaca..a81acf2eb2ff9 100755 --- a/cpp/build-support/run_cpplint.py +++ b/cpp/build-support/run_cpplint.py @@ -26,24 +26,6 @@ from functools import partial -# NOTE(wesm): -# -# * readability/casting is disabled as it aggressively warns about functions -# with names like "int32", so "int32(x)", where int32 is a function name, -# warns with -_filters = ''' --whitespace/comments --readability/casting --readability/todo --readability/alt_tokens --build/header_guard --build/c++11 --build/include_what_you_use --runtime/references --build/include_order -'''.split() - - def _get_chunk_key(filenames): # lists are not hashable so key on the first filename in a chunk return filenames[0] @@ -87,8 +69,6 @@ def _check_some_files(completed_processes, filenames): cmd = [ arguments.cpplint_binary, '--verbose=2', - '--linelength=90', - '--filter=' + ','.join(_filters) ] if (arguments.cpplint_binary.endswith('.py') and platform.system() == 'Windows'): diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 63e2c036c9a6f..b31037a973279 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -259,7 +259,7 @@ macro(resolve_dependency DEPENDENCY_NAME) IS_RUNTIME_DEPENDENCY REQUIRED_VERSION USE_CONFIG) - set(multi_value_args COMPONENTS PC_PACKAGE_NAMES) + set(multi_value_args COMPONENTS OPTIONAL_COMPONENTS PC_PACKAGE_NAMES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" @@ -287,6 +287,9 @@ macro(resolve_dependency DEPENDENCY_NAME) if(ARG_COMPONENTS) list(APPEND FIND_PACKAGE_ARGUMENTS COMPONENTS ${ARG_COMPONENTS}) endif() + if(ARG_OPTIONAL_COMPONENTS) + list(APPEND FIND_PACKAGE_ARGUMENTS OPTIONAL_COMPONENTS ${ARG_OPTIONAL_COMPONENTS}) + endif() if(${DEPENDENCY_NAME}_SOURCE STREQUAL "AUTO") find_package(${FIND_PACKAGE_ARGUMENTS}) set(COMPATIBLE ${${PACKAGE_NAME}_FOUND}) @@ -1289,15 +1292,19 @@ if(ARROW_USE_BOOST) set(Boost_USE_STATIC_LIBS ON) endif() if(ARROW_BOOST_REQUIRE_LIBRARY) - set(ARROW_BOOST_COMPONENTS system filesystem) + set(ARROW_BOOST_COMPONENTS filesystem system) + set(ARROW_BOOST_OPTIONAL_COMPONENTS process) else() set(ARROW_BOOST_COMPONENTS) + set(ARROW_BOOST_OPTIONAL_COMPONENTS) endif() resolve_dependency(Boost REQUIRED_VERSION ${ARROW_BOOST_REQUIRED_VERSION} COMPONENTS ${ARROW_BOOST_COMPONENTS} + OPTIONAL_COMPONENTS + ${ARROW_BOOST_OPTIONAL_COMPONENTS} IS_RUNTIME_DEPENDENCY # libarrow.so doesn't depend on libboost*. FALSE) @@ -1316,14 +1323,35 @@ if(ARROW_USE_BOOST) endif() endforeach() - if(WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - # boost/process/detail/windows/handle_workaround.hpp doesn't work - # without BOOST_USE_WINDOWS_H with MinGW because MinGW doesn't - # provide __kernel_entry without winternl.h. 
- # - # See also: - # https://github.com/boostorg/process/blob/develop/include/boost/process/detail/windows/handle_workaround.hpp - target_compile_definitions(Boost::headers INTERFACE "BOOST_USE_WINDOWS_H=1") + if(TARGET Boost::process) + # Boost >= 1.86 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V1") + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + else() + # Boost < 1.86 + add_library(Boost::process INTERFACE IMPORTED) + if(TARGET Boost::filesystem) + target_link_libraries(Boost::process INTERFACE Boost::filesystem) + endif() + if(TARGET Boost::system) + target_link_libraries(Boost::process INTERFACE Boost::system) + endif() + if(TARGET Boost::headers) + target_link_libraries(Boost::process INTERFACE Boost::headers) + endif() + if(Boost_VERSION VERSION_GREATER_EQUAL 1.80) + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_HAVE_V2") + # Boost < 1.86 has a bug that + # boost::process::v2::process_environment::on_setup() isn't + # defined. We need to build Boost Process source to define it. + # + # See also: + # https://github.com/boostorg/process/issues/312 + target_compile_definitions(Boost::process INTERFACE "BOOST_PROCESS_NEED_SOURCE") + if(WIN32) + target_link_libraries(Boost::process INTERFACE bcrypt ntdll) + endif() + endif() endif() message(STATUS "Boost include dir: ${Boost_INCLUDE_DIRS}") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 65343df1291ba..01ac813f4713b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -644,9 +644,13 @@ else() endif() set(ARROW_TESTING_SHARED_LINK_LIBS arrow_shared ${ARROW_GTEST_GTEST}) -set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON) -set(ARROW_TESTING_STATIC_LINK_LIBS arrow::flatbuffers RapidJSON arrow_static - ${ARROW_GTEST_GTEST}) +set(ARROW_TESTING_SHARED_PRIVATE_LINK_LIBS arrow::flatbuffers RapidJSON Boost::process) +set(ARROW_TESTING_STATIC_LINK_LIBS + arrow::flatbuffers + RapidJSON + Boost::process + arrow_static + ${ARROW_GTEST_GTEST}) set(ARROW_TESTING_SHARED_INSTALL_INTERFACE_LIBS Arrow::arrow_shared) set(ARROW_TESTING_STATIC_INSTALL_INTERFACE_LIBS Arrow::arrow_static) # that depend on gtest @@ -667,9 +671,10 @@ set(ARROW_TESTING_SRCS io/test_common.cc ipc/test_common.cc testing/fixed_width_test_util.cc + testing/generator.cc testing/gtest_util.cc + testing/process.cc testing/random.cc - testing/generator.cc testing/util.cc) # diff --git a/cpp/src/arrow/acero/aggregate_benchmark.cc b/cpp/src/arrow/acero/aggregate_benchmark.cc index 854862e3e48ca..c0dfba66336af 100644 --- a/cpp/src/arrow/acero/aggregate_benchmark.cc +++ b/cpp/src/arrow/acero/aggregate_benchmark.cc @@ -165,11 +165,11 @@ struct SumSentinelUnrolled : public Summer { static void Sum(const ArrayType& array, SumState* state) { SumState local; -#define SUM_NOT_NULL(ITEM) \ - do { \ - local.total += values[i + ITEM] * Traits::NotNull(values[i + ITEM]); \ - local.valid_count++; \ - } while (0) +# define SUM_NOT_NULL(ITEM) \ + do { \ + local.total += values[i + ITEM] * Traits::NotNull(values[i + ITEM]); \ + local.valid_count++; \ + } while (0) const auto values = array.raw_values(); const auto length = array.length(); @@ -185,7 +185,7 @@ struct SumSentinelUnrolled : public Summer { SUM_NOT_NULL(7); } -#undef SUM_NOT_NULL +# undef SUM_NOT_NULL for (int64_t i = length_rounded * 8; i < length; ++i) { local.total += values[i] * Traits::NotNull(values[i]); @@ -256,7 +256,7 @@ struct SumBitmapVectorizeUnroll : public Summer { 
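// The preprocessor churn below (and in the headers that follow) is purely
// mechanical: with IndentPPDirectives: AfterHash, clang-format indents
// directives nested inside a conditional after the '#', for example:
//
//   #ifdef NDEBUG
//   #  define DEBUG_SYNC(...)
//   #endif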
     for (int64_t i = 0; i < length_rounded; i += 8) {
       const uint8_t valid_byte = bitmap[i / 8];
 
-#define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1))
+#  define SUM_SHIFT(ITEM) (values[i + ITEM] * ((valid_byte >> ITEM) & 1))
 
       if (valid_byte < 0xFF) {
         // Some nulls
@@ -277,7 +277,7 @@ struct SumBitmapVectorizeUnroll : public Summer {
       }
     }
 
-#undef SUM_SHIFT
+#  undef SUM_SHIFT
 
     for (int64_t i = length_rounded; i < length; ++i) {
       if (bit_util::GetBit(bitmap, i)) {
diff --git a/cpp/src/arrow/acero/aggregate_node_test.cc b/cpp/src/arrow/acero/aggregate_node_test.cc
index d398fb24b73d5..c623271db9fb4 100644
--- a/cpp/src/arrow/acero/aggregate_node_test.cc
+++ b/cpp/src/arrow/acero/aggregate_node_test.cc
@@ -210,5 +210,57 @@ TEST(GroupByNode, NoSkipNulls) {
   AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch}, out_batches.batches);
 }
 
+TEST(ScalarAggregateNode, AnyAll) {
+  // GH-43768: boolean_any and boolean_all with constant input should work
+  // correctly when min_count != 0.
+  std::shared_ptr<Schema> in_schema = schema({field("not_used", int32())});
+  std::shared_ptr<Schema> out_schema = schema({field("agg_out", boolean())});
+  struct AnyAllCase {
+    std::string batches_json;
+    Expression literal;
+    std::string expected_json;
+    bool skip_nulls = false;
+    uint32_t min_count = 2;
+  };
+  std::vector<AnyAllCase> cases{
+      {"[[42], [42], [42], [42]]", literal(true), "[[true]]"},
+      {"[[42], [42], [42], [42]]", literal(false), "[[false]]"},
+      {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]"},
+      {"[[42]]", literal(true), "[[null]]"},
+      {"[[42], [42], [42]]", literal(true), "[[true]]"},
+      {"[[42], [42], [42]]", literal(true), "[[null]]", /*skip_nulls=*/false,
+       /*min_count=*/4},
+      {"[[42], [42], [42], [42]]", literal(BooleanScalar{}), "[[null]]",
+       /*skip_nulls=*/true},
+  };
+  for (const AnyAllCase& any_all_case : cases) {
+    for (auto func_name : {"any", "all"}) {
+      std::vector<ExecBatch> batches{
+          ExecBatchFromJSON({int32()}, any_all_case.batches_json)};
+      std::vector<Aggregate> aggregates = {
+          Aggregate(func_name,
+                    std::make_shared<ScalarAggregateOptions>(
+                        /*skip_nulls=*/any_all_case.skip_nulls,
+                        /*min_count=*/any_all_case.min_count),
+                    FieldRef("literal"))};
+
+      // And a projection to make the input including a Scalar Boolean
+      Declaration plan = Declaration::Sequence(
+          {{"exec_batch_source", ExecBatchSourceNodeOptions(in_schema, batches)},
+           {"project", ProjectNodeOptions({any_all_case.literal}, {"literal"})},
+           {"aggregate", AggregateNodeOptions(aggregates)}});
+
+      ASSERT_OK_AND_ASSIGN(BatchesWithCommonSchema out_batches,
+                           DeclarationToExecBatches(plan));
+
+      ExecBatch expected_batch =
+          ExecBatchFromJSON({boolean()}, any_all_case.expected_json);
+
+      AssertExecBatchesEqualIgnoringOrder(out_schema, {expected_batch},
+                                          out_batches.batches);
+    }
+  }
+}
+
 }  // namespace acero
 }  // namespace arrow
diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc
index 2248362241cd7..c4f11d01f3d5c 100644
--- a/cpp/src/arrow/acero/asof_join_node.cc
+++ b/cpp/src/arrow/acero/asof_join_node.cc
@@ -34,7 +34,7 @@
 #include "arrow/acero/options.h"
 #include "arrow/acero/unmaterialized_table_internal.h"
 #ifndef NDEBUG
-#include "arrow/acero/options_internal.h"
+#  include "arrow/acero/options_internal.h"
 #endif
 #include "arrow/acero/query_context.h"
 #include "arrow/acero/schema_util.h"
@@ -42,7 +42,7 @@
 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_primitive.h"
 #ifndef NDEBUG
-#include "arrow/compute/function_internal.h"
+#  include "arrow/compute/function_internal.h"
 #endif
 #include "arrow/acero/time_series_util.h"
 #include "arrow/compute/key_hash_internal.h"
@@ -207,16 +207,16 @@ class DebugSync {
   std::unique_lock<std::mutex> debug_lock_;
 };
 
-#define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__)
-#define DEBUG_MANIP(manip) \
-  DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; })
-#define NDEBUG_EXPLICIT
-#define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__
+#  define DEBUG_SYNC(node, ...) DebugSync(node).insert(__VA_ARGS__)
+#  define DEBUG_MANIP(manip) \
+    DebugSync::Manip([](DebugSync& d) -> DebugSync& { return d << manip; })
+#  define NDEBUG_EXPLICIT
+#  define DEBUG_ADD(ndebug, ...) ndebug, __VA_ARGS__
 #else
-#define DEBUG_SYNC(...)
-#define DEBUG_MANIP(...)
-#define NDEBUG_EXPLICIT explicit
-#define DEBUG_ADD(ndebug, ...) ndebug
+#  define DEBUG_SYNC(...)
+#  define DEBUG_MANIP(...)
+#  define NDEBUG_EXPLICIT explicit
+#  define DEBUG_ADD(ndebug, ...) ndebug
 #endif
 
 struct MemoStore {
diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc
index 555f580028fac..5d3e9fba08bbf 100644
--- a/cpp/src/arrow/acero/asof_join_node_test.cc
+++ b/cpp/src/arrow/acero/asof_join_node_test.cc
@@ -26,13 +26,13 @@
 #include "arrow/acero/exec_plan.h"
 #include "arrow/testing/future_util.h"
 #ifndef NDEBUG
-#include
+#  include
 #endif
 #include
 
 #include "arrow/acero/options.h"
 #ifndef NDEBUG
-#include "arrow/acero/options_internal.h"
+#  include "arrow/acero/options_internal.h"
 #endif
 #include "arrow/acero/map_node.h"
 #include "arrow/acero/query_context.h"
diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h
index 50d07bfd948e0..530beaea64827 100644
--- a/cpp/src/arrow/acero/bloom_filter.h
+++ b/cpp/src/arrow/acero/bloom_filter.h
@@ -18,7 +18,7 @@
 #pragma once
 
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
-#include <immintrin.h>
+#  include <immintrin.h>
 #endif
 
 #include
diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc
index a2d6e9575a1aa..30cafd120caea 100644
--- a/cpp/src/arrow/acero/bloom_filter_test.cc
+++ b/cpp/src/arrow/acero/bloom_filter_test.cc
@@ -503,9 +503,9 @@ TEST(BloomFilter, Scaling) {
   num_build.push_back(4000000);
 
   std::vector<BloomFilterBuildStrategy> strategies;
-#ifdef ARROW_ENABLE_THREADING
+#  ifdef ARROW_ENABLE_THREADING
   strategies.push_back(BloomFilterBuildStrategy::PARALLEL);
-#endif
+#  endif
   strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED);
 
   for (const auto hardware_flags : HardwareFlagsForTesting()) {
diff --git a/cpp/src/arrow/acero/options_internal.h b/cpp/src/arrow/acero/options_internal.h
index d4bf79a7cd008..fd3ea78116572 100644
--- a/cpp/src/arrow/acero/options_internal.h
+++ b/cpp/src/arrow/acero/options_internal.h
@@ -18,8 +18,8 @@
 #pragma once
 
 #ifndef NDEBUG
-#include
-#include
+#  include
+#  include
 #endif
 
 namespace arrow {
diff --git a/cpp/src/arrow/acero/visibility.h b/cpp/src/arrow/acero/visibility.h
index 02382232b69dd..21a697a56eca9 100644
--- a/cpp/src/arrow/acero/visibility.h
+++ b/cpp/src/arrow/acero/visibility.h
@@ -20,31 +20,31 @@
 #pragma once
 
 #if defined(_WIN32) || defined(__CYGWIN__)
-#if defined(_MSC_VER)
-#pragma warning(push)
-#pragma warning(disable : 4251)
-#else
-#pragma GCC diagnostic ignored "-Wattributes"
-#endif
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
 
-#ifdef ARROW_ACERO_STATIC
-#define ARROW_ACERO_EXPORT
-#elif defined(ARROW_ACERO_EXPORTING)
-#define ARROW_ACERO_EXPORT __declspec(dllexport)
-#else
-#define ARROW_ACERO_EXPORT __declspec(dllimport)
-#endif
+#  ifdef ARROW_ACERO_STATIC
+#    define ARROW_ACERO_EXPORT
+#  elif defined(ARROW_ACERO_EXPORTING)
+#    define ARROW_ACERO_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_ACERO_EXPORT __declspec(dllimport)
+#  endif
 
-#define ARROW_ACERO_NO_EXPORT
+#  define ARROW_ACERO_NO_EXPORT
 
 #else  // Not Windows
 
-#ifndef ARROW_ACERO_EXPORT
-#define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
-#endif
-#ifndef ARROW_ACERO_NO_EXPORT
-#define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
-#endif
+#  ifndef ARROW_ACERO_EXPORT
+#    define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_ACERO_NO_EXPORT
+#    define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
 
 #endif  // Not-Windows
 
 #if defined(_MSC_VER)
-#pragma warning(pop)
+#  pragma warning(pop)
 #endif
diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc
index 25759f8471365..d16b6cfd2e97d 100644
--- a/cpp/src/arrow/adapters/orc/adapter.cc
+++ b/cpp/src/arrow/adapters/orc/adapter.cc
@@ -25,7 +25,7 @@
 #include
 
 #ifdef ARROW_ORC_NEED_TIME_ZONE_DATABASE_CHECK
-#include <filesystem>
+#  include <filesystem>
 #endif
 
 #include "arrow/adapters/orc/util.h"
diff --git a/cpp/src/arrow/array/array_base.h b/cpp/src/arrow/array/array_base.h
index 716ae0722069e..e4af67d7e5f0b 100644
--- a/cpp/src/arrow/array/array_base.h
+++ b/cpp/src/arrow/array/array_base.h
@@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
   /// \return DeviceAllocationType
   DeviceAllocationType device_type() const { return data_->device_type(); }
 
+  /// \brief Return the statistics of this Array
+  ///
+  /// This just delegates to calling statistics on the underlying ArrayData
+  /// object which backs this Array.
+  ///
+  /// \return std::shared_ptr<ArrayStatistics>
+  std::shared_ptr<ArrayStatistics> statistics() const { return data_->statistics; }
+
  protected:
   Array() = default;
   ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
index 32806d9d2edb3..73e0c692432b6 100644
--- a/cpp/src/arrow/array/array_test.cc
+++ b/cpp/src/arrow/array/array_test.cc
@@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
   }
 }
 
+class TestArrayDataStatistics : public ::testing::Test {
+ public:
+  void SetUp() {
+    valids_ = {1, 0, 1, 1};
+    null_count_ = std::count(valids_.begin(), valids_.end(), 0);
+    null_buffer_ = *internal::BytesToBits(valids_);
+    values_ = {1, 0, 3, -4};
+    min_ = *std::min_element(values_.begin(), values_.end());
+    max_ = *std::max_element(values_.begin(), values_.end());
+    values_buffer_ = Buffer::FromVector(values_);
+    data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
+                            null_count_);
+    data_->statistics = std::make_shared<ArrayStatistics>();
+    data_->statistics->null_count = null_count_;
+    data_->statistics->min = min_;
+    data_->statistics->is_min_exact = true;
+    data_->statistics->max = max_;
+    data_->statistics->is_max_exact = true;
+  }
+
+ protected:
+  std::vector<uint8_t> valids_;
+  size_t null_count_;
+  std::shared_ptr<Buffer> null_buffer_;
+  std::vector<int32_t> values_;
+  int64_t min_;
+  int64_t max_;
+  std::shared_ptr<Buffer> values_buffer_;
+  std::shared_ptr<ArrayData> data_;
+};
+
+TEST_F(TestArrayDataStatistics, MoveConstructor) {
+  ArrayData copied_data(*data_);
+  ArrayData moved_data(std::move(copied_data));
+
+  ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+  ASSERT_TRUE(moved_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(moved_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyConstructor) {
+  ArrayData copied_data(*data_);
+
+  ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, MoveAssignment) {
+  ArrayData copied_data(*data_);
+  ArrayData moved_data;
+  moved_data = std::move(copied_data);
+
+  ASSERT_TRUE(moved_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());
+
+  ASSERT_TRUE(moved_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
+  ASSERT_TRUE(moved_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(moved_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
+  ASSERT_TRUE(moved_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyAssignment) {
+  ArrayData copied_data;
+  copied_data = *data_;
+
+  ASSERT_TRUE(copied_data.statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data.statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
+  ASSERT_TRUE(copied_data.statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data.statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
+  ASSERT_TRUE(copied_data.statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, CopyTo) {
+  ASSERT_OK_AND_ASSIGN(auto copied_data,
+                       data_->CopyTo(arrow::default_cpu_memory_manager()));
+
+  ASSERT_TRUE(copied_data->statistics->null_count.has_value());
+  ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());
+
+  ASSERT_TRUE(copied_data->statistics->min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
+  ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
+  ASSERT_TRUE(copied_data->statistics->is_min_exact);
+
+  ASSERT_TRUE(copied_data->statistics->max.has_value());
+  ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
+  ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
+  ASSERT_TRUE(copied_data->statistics->is_max_exact);
+}
+
+TEST_F(TestArrayDataStatistics, Slice) {
+  auto sliced_data = data_->Slice(0, 1);
+  ASSERT_FALSE(sliced_data->statistics);
+}
+
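// A minimal caller-side sketch (names illustrative): Slice() always drops
// statistics, so a caller that knows its view covers the full range must
// re-attach them explicitly:
//
//   auto sliced = data->Slice(0, data->length);  // sliced->statistics == nullptr
//   sliced->statistics = data->statistics;       // valid only for a full-range view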
 template <typename TYPE>
 class TestPrimitiveArray : public ::testing::Test {
  public:
diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc
index 83eeb56c496cf..8e29297a8c175 100644
--- a/cpp/src/arrow/array/data.cc
+++ b/cpp/src/arrow/array/data.cc
@@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const ArrayData& data,
     ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn));
   }
 
+  output->statistics = data.statistics;
+
   return output;
 }
 }  // namespace
@@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
   } else {
     copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
   }
+  copy->statistics = nullptr;
   return copy;
 }
diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h
index e0508fe6980a7..1e6ee9a1d32ff 100644
--- a/cpp/src/arrow/array/data.h
+++ b/cpp/src/arrow/array/data.h
@@ -24,6 +24,7 @@
 #include
 #include
 
+#include "arrow/array/statistics.h"
 #include "arrow/buffer.h"
 #include "arrow/result.h"
 #include "arrow/type.h"
@@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
         offset(other.offset),
         buffers(std::move(other.buffers)),
         child_data(std::move(other.child_data)),
-        dictionary(std::move(other.dictionary)) {
+        dictionary(std::move(other.dictionary)),
+        statistics(std::move(other.statistics)) {
     SetNullCount(other.null_count);
   }
 
@@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
         offset(other.offset),
         buffers(other.buffers),
         child_data(other.child_data),
-        dictionary(other.dictionary) {
+        dictionary(other.dictionary),
+        statistics(other.statistics) {
     SetNullCount(other.null_count);
   }
 
@@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
     buffers = std::move(other.buffers);
     child_data = std::move(other.child_data);
     dictionary = std::move(other.dictionary);
+    statistics = std::move(other.statistics);
     return *this;
   }
 
@@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
     buffers = other.buffers;
     child_data = other.child_data;
     dictionary = other.dictionary;
+    statistics = other.statistics;
     return *this;
   }
 
@@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
   }
 
   /// \brief Construct a zero-copy slice of the data with the given offset and length
+  ///
+  /// The associated `ArrayStatistics` is always discarded in the sliced
+  /// `ArrayData`, because statistics computed for the original
+  /// `ArrayData` may no longer be valid for the slice. Callers that want
+  /// to carry over the original statistics must re-attach them themselves.
+  ///
+  /// This holds even when the requested slice covers the same range as
+  /// the original `ArrayData` (where the statistics would still be
+  /// valid): they are discarded in that case too. Use `Copy()` instead
+  /// for that case.
   std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
 
   /// \brief Input-checking variant of Slice
@@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {
 
   // The dictionary for this Array, if any. Only used for dictionary type
   std::shared_ptr<ArrayData> dictionary;
+
+  // The statistics for this Array.
+  std::shared_ptr<ArrayStatistics> statistics;
 };
 
 /// \brief A non-owning Buffer reference
diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h
index 6abe866b5f6f6..db051fff5ff05 100644
--- a/cpp/src/arrow/c/abi.h
+++ b/cpp/src/arrow/c/abi.h
@@ -41,11 +41,11 @@ extern "C" {
 #endif
 
 #ifndef ARROW_C_DATA_INTERFACE
-#define ARROW_C_DATA_INTERFACE
+#  define ARROW_C_DATA_INTERFACE
 
-#define ARROW_FLAG_DICTIONARY_ORDERED 1
-#define ARROW_FLAG_NULLABLE 2
-#define ARROW_FLAG_MAP_KEYS_SORTED 4
+#  define ARROW_FLAG_DICTIONARY_ORDERED 1
+#  define ARROW_FLAG_NULLABLE 2
+#  define ARROW_FLAG_MAP_KEYS_SORTED 4
 
 struct ArrowSchema {
   // Array type description
@@ -83,7 +83,7 @@ struct ArrowArray {
 #endif  // ARROW_C_DATA_INTERFACE
 
 #ifndef ARROW_C_DEVICE_DATA_INTERFACE
-#define ARROW_C_DEVICE_DATA_INTERFACE
+#  define ARROW_C_DEVICE_DATA_INTERFACE
 
 // Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html
 
@@ -91,33 +91,33 @@ struct ArrowArray {
 typedef int32_t ArrowDeviceType;
 
 // CPU device, same as using ArrowArray directly
-#define ARROW_DEVICE_CPU 1
+#  define ARROW_DEVICE_CPU 1
 // CUDA GPU Device
-#define ARROW_DEVICE_CUDA 2
+#  define ARROW_DEVICE_CUDA 2
 // Pinned CUDA CPU memory by cudaMallocHost
-#define ARROW_DEVICE_CUDA_HOST 3
+#  define ARROW_DEVICE_CUDA_HOST 3
 // OpenCL Device
-#define ARROW_DEVICE_OPENCL 4
+#  define ARROW_DEVICE_OPENCL 4
 // Vulkan buffer for next-gen graphics
-#define ARROW_DEVICE_VULKAN 7
+#  define ARROW_DEVICE_VULKAN 7
 // Metal for Apple GPU
-#define ARROW_DEVICE_METAL 8
+#  define ARROW_DEVICE_METAL 8
 // Verilog simulator buffer
-#define ARROW_DEVICE_VPI 9
+#  define ARROW_DEVICE_VPI 9
 // ROCm GPUs for AMD GPUs
-#define ARROW_DEVICE_ROCM 10
+#  define ARROW_DEVICE_ROCM 10
 // Pinned ROCm CPU memory allocated by hipMallocHost
-#define ARROW_DEVICE_ROCM_HOST 11
+#  define ARROW_DEVICE_ROCM_HOST 11
 // Reserved for extension
-#define ARROW_DEVICE_EXT_DEV 12
+#  define ARROW_DEVICE_EXT_DEV 12
 // CUDA managed/unified memory allocated by cudaMallocManaged
-#define ARROW_DEVICE_CUDA_MANAGED 13
+#  define ARROW_DEVICE_CUDA_MANAGED 13
 // unified shared memory allocated on a oneAPI non-partitioned device.
-#define ARROW_DEVICE_ONEAPI 14
+#  define ARROW_DEVICE_ONEAPI 14
 // GPU support for next-gen WebGPU standard
-#define ARROW_DEVICE_WEBGPU 15
+#  define ARROW_DEVICE_WEBGPU 15
 // Qualcomm Hexagon DSP
-#define ARROW_DEVICE_HEXAGON 16
+#  define ARROW_DEVICE_HEXAGON 16
 
 struct ArrowDeviceArray {
   // the Allocated Array
@@ -138,7 +138,7 @@ struct ArrowDeviceArray {
 #endif  // ARROW_C_DEVICE_DATA_INTERFACE
 
 #ifndef ARROW_C_STREAM_INTERFACE
-#define ARROW_C_STREAM_INTERFACE
+#  define ARROW_C_STREAM_INTERFACE
 
 struct ArrowArrayStream {
   // Callback to get the stream type
@@ -179,7 +179,7 @@ struct ArrowArrayStream {
 #endif  // ARROW_C_STREAM_INTERFACE
 
 #ifndef ARROW_C_DEVICE_STREAM_INTERFACE
-#define ARROW_C_DEVICE_STREAM_INTERFACE
+#  define ARROW_C_DEVICE_STREAM_INTERFACE
 
 // Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 09bb524adbdf0..01fd56f631d99 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -48,7 +48,7 @@
 
 // TODO(GH-37221): Remove these ifdef checks when compute dependency is removed
 #ifdef ARROW_COMPUTE
-#include "arrow/compute/api_vector.h"
+# include "arrow/compute/api_vector.h"
 #endif
 
 namespace arrow {
diff --git a/cpp/src/arrow/c/dlpack_abi.h b/cpp/src/arrow/c/dlpack_abi.h
index 4af557a7ed5d7..fbe2a56a344b3 100644
--- a/cpp/src/arrow/c/dlpack_abi.h
+++ b/cpp/src/arrow/c/dlpack_abi.h
@@ -12,9 +12,9 @@
  * \brief Compatibility with C++
  */
 #ifdef __cplusplus
-#define DLPACK_EXTERN_C extern "C"
+# define DLPACK_EXTERN_C extern "C"
 #else
-#define DLPACK_EXTERN_C
+# define DLPACK_EXTERN_C
 #endif
 
 /*! \brief The current major version of dlpack */
@@ -25,13 +25,13 @@
 
 /*! \brief DLPACK_DLL prefix for windows */
 #ifdef _WIN32
-#ifdef DLPACK_EXPORTS
-#define DLPACK_DLL __declspec(dllexport)
+# ifdef DLPACK_EXPORTS
+#  define DLPACK_DLL __declspec(dllexport)
+# else
+#  define DLPACK_DLL __declspec(dllimport)
+# endif
 #else
-#define DLPACK_DLL __declspec(dllimport)
-#endif
-#else
-#define DLPACK_DLL
+# define DLPACK_DLL
 #endif
 
 #include <stdint.h>
diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc
index 55eec53ced1c7..854127480744e 100644
--- a/cpp/src/arrow/chunk_resolver.cc
+++ b/cpp/src/arrow/chunk_resolver.cc
@@ -60,42 +60,38 @@ inline std::vector<int64_t> MakeChunksOffsets(const std::vector<T>& chunks) {
 
 template <typename IndexType>
 void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets,
                        int64_t n_indices, const IndexType* logical_index_vec,
-                       IndexType* out_chunk_index_vec, IndexType chunk_hint,
-                       IndexType* out_index_in_chunk_vec) {
+                       TypedChunkLocation<IndexType>* out_chunk_location_vec,
+                       IndexType chunk_hint) {
   auto* offsets = reinterpret_cast<const uint64_t*>(signed_offsets);
   const auto num_chunks = static_cast<IndexType>(num_offsets - 1);
   // chunk_hint in [0, num_offsets) per the precondition.
   for (int64_t i = 0; i < n_indices; i++) {
-    const auto index = static_cast<uint64_t>(logical_index_vec[i]);
+    auto typed_logical_index = logical_index_vec[i];
+    const auto index = static_cast<uint64_t>(typed_logical_index);
+    // use or update chunk_hint
     if (index >= offsets[chunk_hint] &&
         (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) {
-      out_chunk_index_vec[i] = chunk_hint;  // hint is correct!
-      continue;
+      // hint is correct!
+    } else {
+      // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
+      auto chunk_index =
+          ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
+      chunk_hint = static_cast<IndexType>(chunk_index);
     }
-    // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
-    auto chunk_index =
-        ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
-    chunk_hint = static_cast<IndexType>(chunk_index);
-    out_chunk_index_vec[i] = chunk_hint;
-  }
-  if (out_index_in_chunk_vec != NULLPTR) {
-    for (int64_t i = 0; i < n_indices; i++) {
-      auto logical_index = logical_index_vec[i];
-      auto chunk_index = out_chunk_index_vec[i];
-      // chunk_index is in [0, chunks.size()] no matter what the
-      // value of logical_index is, so it's always safe to dereference
-      // offset_ as it contains chunks.size()+1 values.
-      out_index_in_chunk_vec[i] =
-          logical_index - static_cast<IndexType>(offsets[chunk_index]);
+    out_chunk_location_vec[i].chunk_index = chunk_hint;
+    // chunk_index is in [0, chunks.size()] no matter what the
+    // value of logical_index is, so it's always safe to dereference
+    // offsets as it contains chunks.size()+1 values.
+    out_chunk_location_vec[i].index_in_chunk =
+        typed_logical_index - static_cast<IndexType>(offsets[chunk_hint]);
 #if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER)
-      // Make it more likely that Valgrind/ASAN can catch an invalid memory
-      // access by poisoning out_index_in_chunk_vec[i] when the logical
-      // index is out-of-bounds.
-      if (chunk_index == num_chunks) {
-        out_index_in_chunk_vec[i] = std::numeric_limits<IndexType>::max();
-      }
-#endif
+    // Make it more likely that Valgrind/ASAN can catch an invalid memory
+    // access by poisoning the index-in-chunk value when the logical
+    // index is out-of-bounds.
+    if (chunk_hint == num_chunks) {
+      out_chunk_location_vec[i].index_in_chunk = std::numeric_limits<IndexType>::max();
     }
+#endif
   }
 }
@@ -130,31 +126,31 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept {
 }
 
 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec,
-                                    uint8_t* out_chunk_index_vec, uint8_t chunk_hint,
-                                    uint8_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint8_t>* out_chunk_location_vec,
+                                    uint8_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }
 
 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec,
-                                    uint32_t* out_chunk_index_vec, uint32_t chunk_hint,
-                                    uint32_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint32_t>* out_chunk_location_vec,
+                                    uint32_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }
 
 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec,
-                                    uint16_t* out_chunk_index_vec, uint16_t chunk_hint,
-                                    uint16_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint16_t>* out_chunk_location_vec,
+                                    uint16_t chunk_hint) const {
   ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }
 
 void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec,
-                                    uint64_t* out_chunk_index_vec, uint64_t chunk_hint,
-                                    uint64_t* out_index_in_chunk_vec) const {
+                                    TypedChunkLocation<uint64_t>* out_chunk_location_vec,
+                                    uint64_t chunk_hint) const {
  ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec,
-                    out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec);
+                    out_chunk_location_vec, chunk_hint);
 }
 
 }  // namespace arrow::internal
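A concrete trace of the hint-then-bisect loop above may help (illustrative values, not from the patch): assume three chunks of lengths 2, 3, and 4, so offsets = [0, 2, 5, 9] and num_chunks = 3.

// chunk_hint starts at 0.
// logical index 1: 1 >= offsets[0] and 1 < offsets[1]  -> hint 0 is correct;
//                  location = {chunk_index=0, index_in_chunk=1 - 0 = 1}
// logical index 6: fails the hint test; Bisect finds chunk 2, hint := 2;
//                  location = {chunk_index=2, index_in_chunk=6 - 5 = 1}
// logical index 7: hint 2 still correct (7 < offsets[3] = 9);
//                  location = {chunk_index=2, index_in_chunk=7 - 5 = 2}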
diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h
index a2a3d5a864243..83fda62387fe1 100644
--- a/cpp/src/arrow/chunk_resolver.h
+++ b/cpp/src/arrow/chunk_resolver.h
@@ -31,28 +31,34 @@ namespace arrow::internal {
 
 struct ChunkResolver;
 
-struct ChunkLocation {
+template <typename IndexType>
+struct TypedChunkLocation {
   /// \brief Index of the chunk in the array of chunks
   ///
   /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used
   /// to represent out-of-bounds locations.
-  int64_t chunk_index = 0;
+  IndexType chunk_index = 0;
 
   /// \brief Index of the value in the chunk
   ///
   /// The value is UNDEFINED if chunk_index >= chunks.size()
-  int64_t index_in_chunk = 0;
+  IndexType index_in_chunk = 0;
 
-  ChunkLocation() = default;
+  TypedChunkLocation() = default;
 
-  ChunkLocation(int64_t chunk_index, int64_t index_in_chunk)
-      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {}
+  TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk)
+      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {
+    static_assert(sizeof(TypedChunkLocation) == 2 * sizeof(IndexType));
+    static_assert(alignof(TypedChunkLocation) == alignof(IndexType));
+  }
 
-  bool operator==(ChunkLocation other) const {
+  bool operator==(TypedChunkLocation other) const {
     return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk;
   }
 };
 
+using ChunkLocation = TypedChunkLocation<int64_t>;
+
 /// \brief An utility that incrementally resolves logical indices into
 /// physical indices in a chunked array.
 struct ARROW_EXPORT ChunkResolver {
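The static_asserts above are what make the `reinterpret_cast` inside `ResolveMany()` (below) safe: a `TypedChunkLocation<IndexType>` must have exactly the layout of two `IndexType` values, so a buffer of locations over a signed index type can be viewed as locations over its unsigned counterpart. A compile-time sketch of that guarantee (my restatement, not code from the patch):

// int32_t and uint32_t locations share size and alignment, so a
// TypedChunkLocation<int32_t>* buffer can be reinterpreted as
// TypedChunkLocation<uint32_t>* inside ResolveMany().
static_assert(sizeof(arrow::internal::TypedChunkLocation<int32_t>) ==
              sizeof(arrow::internal::TypedChunkLocation<uint32_t>));
static_assert(alignof(arrow::internal::TypedChunkLocation<int32_t>) ==
              alignof(arrow::internal::TypedChunkLocation<uint32_t>));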
@@ -144,26 +150,25 @@ struct ARROW_EXPORT ChunkResolver {
   ///
   /// \pre 0 <= logical_index_vec[i] < logical_array_length()
   ///      (for well-defined and valid chunk index results)
-  /// \pre out_chunk_index_vec has space for `n_indices`
+  /// \pre out_chunk_location_vec has space for `n_indices` locations
   /// \pre chunk_hint in [0, chunks.size()]
-  /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n)
+  /// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n)
   /// \post if logical_index_vec[i] >= chunked_array.length(), then
-  ///       out_chunk_index_vec[i] == chunks.size()
-  ///       and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds)
-  /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and
-  ///       out_index_in_chunk_vec[i] are UNDEFINED
+  ///       out_chunk_location_vec[i].chunk_index == chunks.size()
+  ///       and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be
+  ///       out-of-bounds)
+  /// \post if logical_index_vec[i] < 0, then both values in
+  ///       out_chunk_location_vec[i] are UNDEFINED
   ///
   /// \param n_indices The number of logical indices to resolve
   /// \param logical_index_vec The logical indices to resolve
-  /// \param out_chunk_index_vec The output array where the chunk indices will be written
+  /// \param out_chunk_location_vec The output array where the locations will be written
   /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
-  /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the
-  ///        within-chunk indices will be written
   /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
   template <typename IndexType>
   [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
-                                 IndexType* out_chunk_index_vec, IndexType chunk_hint = 0,
-                                 IndexType* out_index_in_chunk_vec = NULLPTR) const {
+                                 TypedChunkLocation<IndexType>* out_chunk_location_vec,
+                                 IndexType chunk_hint = 0) const {
     if constexpr (sizeof(IndexType) < sizeof(uint64_t)) {
       // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()).
      constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
@@ -188,13 +193,11 @@ struct ARROW_EXPORT ChunkResolver {
       // logical index in the chunked array.
       using U = std::make_unsigned_t<IndexType>;
       ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
-                      reinterpret_cast<U*>(out_chunk_index_vec),
-                      static_cast<U>(chunk_hint),
-                      reinterpret_cast<U*>(out_index_in_chunk_vec));
+                      reinterpret_cast<TypedChunkLocation<U>*>(out_chunk_location_vec),
+                      static_cast<U>(chunk_hint));
     } else {
       static_assert(std::is_unsigned_v<IndexType>);
-      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint,
-                      out_index_in_chunk_vec);
+      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec, chunk_hint);
     }
     return true;
   }
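A caller-side sketch of the new signature (illustrative; assumes `resolver` is an existing arrow::internal::ChunkResolver, which is an internal API):

// Resolve a batch of logical indices in one call, reusing the hint across them.
std::vector<uint32_t> logical = {0, 7, 42};
std::vector<arrow::internal::TypedChunkLocation<uint32_t>> locations(logical.size());
bool ok = resolver.ResolveMany<uint32_t>(
    static_cast<int64_t>(logical.size()), logical.data(), locations.data(),
    /*chunk_hint=*/0);
// ok is false only if the chunk count does not fit in uint32_t.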
@@ -226,10 +229,14 @@ struct ARROW_EXPORT ChunkResolver {
   /// \pre all the pre-conditions of ChunkResolver::ResolveMany()
   /// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
-  void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const;
-  void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const;
-  void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const;
-  void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const;
+  void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation<uint8_t>*,
+                       uint8_t) const;
+  void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation<uint16_t>*,
+                       uint16_t) const;
+  void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation<uint32_t>*,
+                       uint32_t) const;
+  void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation<uint64_t>*,
+                       uint64_t) const;
 
  public:
   /// \brief Find the index of the chunk that contains the logical index.
diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc
index b796e9250008a..bf9d4af7c7bb0 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -37,6 +37,7 @@ namespace arrow {
 
 using internal::ChunkLocation;
 using internal::ChunkResolver;
+using internal::TypedChunkLocation;
 
 class TestChunkedArray : public ::testing::Test {
  protected:
@@ -380,24 +381,26 @@ class TestChunkResolverMany : public ::testing::Test {
   Result<std::vector<ChunkLocation>> ResolveMany(
       const ChunkResolver& resolver, const std::vector<IndexType>& logical_index_vec) {
     const size_t n = logical_index_vec.size();
-    std::vector<IndexType> chunk_index_vec;
-    chunk_index_vec.resize(n);
-    std::vector<IndexType> index_in_chunk_vec;
-    index_in_chunk_vec.resize(n);
+    std::vector<TypedChunkLocation<IndexType>> chunk_location_vec;
+    chunk_location_vec.resize(n);
     bool valid = resolver.ResolveMany(
-        static_cast<int64_t>(n), logical_index_vec.data(), chunk_index_vec.data(), 0,
-        index_in_chunk_vec.data());
+        static_cast<int64_t>(n), logical_index_vec.data(), chunk_location_vec.data(), 0);
     if (ARROW_PREDICT_FALSE(!valid)) {
       return Status::Invalid("index type doesn't fit possible chunk indexes");
     }
-    std::vector<ChunkLocation> locations;
-    locations.reserve(n);
-    for (size_t i = 0; i < n; i++) {
-      auto chunk_index = static_cast<int64_t>(chunk_index_vec[i]);
-      auto index_in_chunk = static_cast<int64_t>(index_in_chunk_vec[i]);
-      locations.emplace_back(chunk_index, index_in_chunk);
+    if constexpr (std::is_same<IndexType, int64_t>::value) {
+      return chunk_location_vec;
+    } else {
+      std::vector<ChunkLocation> locations;
+      locations.reserve(n);
+      for (size_t i = 0; i < n; i++) {
+        auto loc = chunk_location_vec[i];
+        auto chunk_index = static_cast<int64_t>(loc.chunk_index);
+        auto index_in_chunk = static_cast<int64_t>(loc.index_in_chunk);
+        locations.emplace_back(chunk_index, index_in_chunk);
+      }
+      return locations;
     }
-    return locations;
   }
 
   void CheckResolveMany(const ChunkResolver& resolver,
diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
index cfa1cd8193f36..cfb6265f12904 100644
--- a/cpp/src/arrow/compute/kernel.h
+++ b/cpp/src/arrow/compute/kernel.h
@@ -42,7 +42,7 @@
 // macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
 // No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
 #if defined(__APPLE__) && defined(PREALLOCATE)
-#undef PREALLOCATE
+# undef PREALLOCATE
 #endif
 
 namespace arrow {
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
index 1fbcd6a249093..b545d8bcc1003 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc
@@ -23,7 +23,9 @@
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/hashing.h"
 
-#include
+// Include templated definitions for aggregate kernels that must be compiled
+// here with the SIMD level configured for this compilation unit in the build.
+#include "arrow/compute/kernels/aggregate_basic.inc.cc"  // NOLINT(build/include)
 
 namespace arrow {
 namespace compute {
@@ -276,11 +278,6 @@ struct SumImplDefault : public SumImpl<ArrowType, SimdLevel::NONE> {
   using SumImpl<ArrowType, SimdLevel::NONE>::SumImpl;
 };
 
-template <typename ArrowType>
-struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
-  using MeanImpl<ArrowType, SimdLevel::NONE>::MeanImpl;
-};
-
 Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
                                              const KernelInitArgs& args) {
   SumLikeInit<SumImplDefault> visitor(
@@ -289,6 +286,14 @@ Result<std::unique_ptr<KernelState>> SumInit(KernelContext* ctx,
   return visitor.Create();
 }
 
+// ----------------------------------------------------------------------
+// Mean implementation
+
+template <typename ArrowType>
+struct MeanImplDefault : public MeanImpl<ArrowType, SimdLevel::NONE> {
+  using MeanImpl<ArrowType, SimdLevel::NONE>::MeanImpl;
+};
+
 Result<std::unique_ptr<KernelState>> MeanInit(KernelContext* ctx,
                                               const KernelInitArgs& args) {
   MeanKernelInit<MeanImplDefault> visitor(
@@ -482,8 +487,8 @@ void AddFirstOrLastAggKernel(ScalarAggregateFunction* func,
 // ----------------------------------------------------------------------
 // MinMax implementation
 
-Result<std::unique_ptr<KernelState>> MinMaxInit(KernelContext* ctx,
-                                                const KernelInitArgs& args) {
+Result<std::unique_ptr<KernelState>> MinMaxInitDefault(KernelContext* ctx,
+                                                       const KernelInitArgs& args) {
   ARROW_ASSIGN_OR_RAISE(TypeHolder out_type,
                         args.kernel->signature->out_type().Resolve(ctx, args.inputs));
   MinMaxInitState<SimdLevel::NONE> visitor(
@@ -532,13 +537,13 @@ struct BooleanAnyImpl : public ScalarAggregator {
     }
     if (batch[0].is_scalar()) {
       const Scalar& scalar = *batch[0].scalar;
-      this->has_nulls = !scalar.is_valid;
-      this->any = scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
-      this->count += scalar.is_valid;
+      this->has_nulls |= !scalar.is_valid;
+      this->any |= scalar.is_valid && checked_cast<const BooleanScalar&>(scalar).value;
+      this->count += scalar.is_valid * batch.length;
       return Status::OK();
     }
     const ArraySpan& data = batch[0].array;
-    this->has_nulls = data.GetNullCount() > 0;
+    this->has_nulls |= data.GetNullCount() > 0;
     this->count += data.length - data.GetNullCount();
     arrow::internal::OptionalBinaryBitBlockCounter counter(
         data.buffers[0].data, data.offset, data.buffers[1].data, data.offset,
@@ -603,13 +608,13 @@ struct BooleanAllImpl : public ScalarAggregator {
     }
     if (batch[0].is_scalar()) {
       const Scalar& scalar = *batch[0].scalar;
-      this->has_nulls = !scalar.is_valid;
-      this->count += scalar.is_valid;
-      this->all = !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
+      this->has_nulls |= !scalar.is_valid;
+      this->count += scalar.is_valid * batch.length;
+      this->all &= !scalar.is_valid || checked_cast<const BooleanScalar&>(scalar).value;
      return Status::OK();
     }
     const ArraySpan& data = batch[0].array;
-    this->has_nulls = data.GetNullCount() > 0;
+    this->has_nulls |= data.GetNullCount() > 0;
     this->count += data.length - data.GetNullCount();
     arrow::internal::OptionalBinaryBitBlockCounter counter(
         data.buffers[1].data, data.offset, data.buffers[0].data, data.offset,
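The switch from `=` to `|=`/`&=` and the new `* batch.length` factor matter once Consume() runs more than once and a scalar batch stands for several logical rows. A small trace with illustrative values:

// BooleanAnyImpl::Consume, two consecutive batches:
//   batch 1: boolean array [null, null]    -> has_nulls |= true
//   batch 2: boolean array [false, false]  -> has_nulls |= false (stays true)
// The old `has_nulls = ...` assignment would have reset the flag to false in
// batch 2. Likewise, a valid scalar spanning batch.length == 5 rows now adds
// 5 to `count` (scalar.is_valid * batch.length) instead of 1.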
@@ -1114,14 +1119,14 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) {
   // Add min max function
   func = std::make_shared<ScalarAggregateFunction>("min_max", Arity::Unary(), min_max_doc,
                                                    &default_scalar_aggregate_options);
-  AddMinMaxKernels(MinMaxInit, {null(), boolean()}, func.get());
-  AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get());
-  AddMinMaxKernels(MinMaxInit, TemporalTypes(), func.get());
-  AddMinMaxKernels(MinMaxInit, BaseBinaryTypes(), func.get());
-  AddMinMaxKernel(MinMaxInit, Type::FIXED_SIZE_BINARY, func.get());
-  AddMinMaxKernel(MinMaxInit, Type::INTERVAL_MONTHS, func.get());
-  AddMinMaxKernel(MinMaxInit, Type::DECIMAL128, func.get());
-  AddMinMaxKernel(MinMaxInit, Type::DECIMAL256, func.get());
+  AddMinMaxKernels(MinMaxInitDefault, {null(), boolean()}, func.get());
+  AddMinMaxKernels(MinMaxInitDefault, NumericTypes(), func.get());
+  AddMinMaxKernels(MinMaxInitDefault, TemporalTypes(), func.get());
+  AddMinMaxKernels(MinMaxInitDefault, BaseBinaryTypes(), func.get());
+  AddMinMaxKernel(MinMaxInitDefault, Type::FIXED_SIZE_BINARY, func.get());
+  AddMinMaxKernel(MinMaxInitDefault, Type::INTERVAL_MONTHS, func.get());
+  AddMinMaxKernel(MinMaxInitDefault, Type::DECIMAL128, func.get());
+  AddMinMaxKernel(MinMaxInitDefault, Type::DECIMAL256, func.get());
 
   // Add the SIMD variants for min max
 #if defined(ARROW_HAVE_RUNTIME_AVX2)
   if (cpu_info->IsSupported(arrow::internal::CpuInfo::AVX2)) {
diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
new file mode 100644
index 0000000000000..f2151e0a9e029
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/aggregate_basic.inc.cc
@@ -0,0 +1,1025 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// .inc.cc file to be included in a compilation unit whose kernels are meant
+// to be compiled and auto-vectorized by the compiler, with different SIMD
+// levels passed as compiler flags.
+//
+// There is no include guard, so avoid double inclusion in the compilation
+// unit that includes this .inc.cc file.
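The per-SIMD-level inclusion pattern can be pictured as sketched below. The file name is an illustrative assumption, not taken from this patch; the registration behind ARROW_HAVE_RUNTIME_AVX2 above is what actually wires such variants in.

// Hypothetical sketch of a SIMD-specific translation unit that reuses the
// same kernel definitions. A file like this would be compiled with AVX2
// flags, letting the compiler auto-vectorize the included kernels.
//
// // aggregate_basic_avx2.cc (illustrative)
// #include "arrow/compute/kernels/aggregate_basic.inc.cc"  // NOLINT(build/include)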
+
+#include
+#include
+#include
+#include
+#include
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/kernels/aggregate_internal.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/align_util.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/decimal.h"
+
+namespace arrow::compute::internal {
+namespace {
+
+// ----------------------------------------------------------------------
+// Sum implementation
+
+template <typename ArrowType, SimdLevel::type SimdLevel,
+          typename ResultType = typename FindAccumulatorType<ArrowType>::Type>
+struct SumImpl : public ScalarAggregator {
+  using ThisType = SumImpl<ArrowType, SimdLevel, ResultType>;
+  using CType = typename TypeTraits<ArrowType>::CType;
+  using SumType = ResultType;
+  using SumCType = typename TypeTraits<SumType>::CType;
+  using OutputType = typename TypeTraits<SumType>::ScalarType;
+
+  SumImpl(std::shared_ptr<DataType> out_type, ScalarAggregateOptions options_)
+      : out_type(std::move(out_type)), options(std::move(options_)) {}
+
+  Status Consume(KernelContext*, const ExecSpan& batch) override {
+    if (batch[0].is_array()) {
+      const ArraySpan& data = batch[0].array;
+      this->count += data.length - data.GetNullCount();
+      this->nulls_observed = this->nulls_observed || data.GetNullCount();
+
+      if (!options.skip_nulls && this->nulls_observed) {
+        // Short-circuit
+        return Status::OK();
+      }
+
+      if (is_boolean_type<ArrowType>::value) {
+        this->sum += GetTrueCount(data);
+      } else {
+        this->sum += SumArray<CType, SumCType, SimdLevel>(data);
+      }
+    } else {
+      const Scalar& data = *batch[0].scalar;
+      this->count += data.is_valid * batch.length;
+      this->nulls_observed = this->nulls_observed || !data.is_valid;
+      if (data.is_valid) {
+        this->sum += internal::UnboxScalar<ArrowType>::Unbox(data) * batch.length;
+      }
+    }
+    return Status::OK();
+  }
+
+  Status MergeFrom(KernelContext*, KernelState&& src) override {
+    const auto& other = checked_cast<const ThisType&>(src);
+    this->count += other.count;
+    this->sum += other.sum;
+    this->nulls_observed = this->nulls_observed || other.nulls_observed;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext*, Datum* out) override {
+    if ((!options.skip_nulls && this->nulls_observed) ||
+        (this->count < options.min_count)) {
+      out->value = std::make_shared<OutputType>(out_type);
+    } else {
+      out->value = std::make_shared<OutputType>(this->sum, out_type);
+    }
+    return Status::OK();
+  }
+
+  size_t count = 0;
+  bool nulls_observed = false;
+  SumCType sum = 0;
+  std::shared_ptr<DataType> out_type;
+  ScalarAggregateOptions options;
+};
+
+template <typename ArrowType>
+struct NullImpl : public ScalarAggregator {
+  using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+  explicit NullImpl(const ScalarAggregateOptions& options_) : options(options_) {}
+
+  Status Consume(KernelContext*, const ExecSpan& batch) override {
+    if (batch[0].is_scalar() || batch[0].array.GetNullCount() > 0) {
+      // If the batch is a scalar or an array with elements, set is_empty to false
+      is_empty = false;
+    }
+    return Status::OK();
+  }
+
+  Status MergeFrom(KernelContext*, KernelState&& src) override {
+    const auto& other = checked_cast<const NullImpl<ArrowType>&>(src);
+    this->is_empty &= other.is_empty;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext*, Datum* out) override {
+    if ((options.skip_nulls || this->is_empty) && options.min_count == 0) {
+      // Return 0 if the remaining data is empty
+      out->value = output_empty();
+    } else {
+      out->value = MakeNullScalar(TypeTraits<ArrowType>::type_singleton());
+    }
+    return Status::OK();
+  }
+
+  virtual std::shared_ptr<Scalar> output_empty() = 0;
+
+  bool is_empty = true;
+  ScalarAggregateOptions options;
+};
+
+template <typename ArrowType>
+struct NullSumImpl : public NullImpl<ArrowType> {
+  using ScalarType = typename TypeTraits<ArrowType>::ScalarType;
+
+  explicit NullSumImpl(const ScalarAggregateOptions& options_)
+      : NullImpl<ArrowType>(options_) {}
+
+  std::shared_ptr<Scalar> output_empty() override {
+    return std::make_shared<ScalarType>(0);
+  }
+};
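For orientation, the accumulator above is the machinery behind the public "sum" function; a hedged usage sketch follows (assumes `array` is a built `std::shared_ptr<arrow::Array>` and the default exec context):

#include "arrow/compute/api_aggregate.h"

// With skip_nulls=false, any observed null yields a null result, mirroring
// the nulls_observed check in SumImpl::Finalize() above.
arrow::compute::ScalarAggregateOptions options(/*skip_nulls=*/false,
                                               /*min_count=*/1);
arrow::Result<arrow::Datum> result = arrow::compute::Sum(array, options);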
+
+template