GH-45045: [C++][Parquet] Add a benchmark for size_statistics_level (#… #778
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Display name of this workflow.
name: C++
# Triggers: pushes to any branch except dependabot/** and to any tag, plus
# pull requests. Both are limited to changes under the listed paths.
# The two path lists are intentionally identical; keep them in sync.
on:
  push:
    branches:
      - '**'
      - '!dependabot/**'
    tags:
      - '**'
    paths:
      - '.dockerignore'
      - '.github/workflows/cpp.yml'
      - 'ci/conda_env_*'
      - 'ci/docker/**'
      - 'ci/scripts/cpp_*'
      - 'ci/scripts/install_azurite.sh'
      - 'ci/scripts/install_gcs_testbench.sh'
      - 'ci/scripts/install_minio.sh'
      - 'ci/scripts/msys2_*'
      - 'ci/scripts/util_*'
      - 'cpp/**'
      - 'docker-compose.yml'
      - 'format/Flight.proto'
      - 'testing'
  pull_request:
    paths:
      - '.dockerignore'
      - '.github/workflows/cpp.yml'
      - 'ci/conda_env_*'
      - 'ci/docker/**'
      - 'ci/scripts/cpp_*'
      - 'ci/scripts/install_azurite.sh'
      - 'ci/scripts/install_gcs_testbench.sh'
      - 'ci/scripts/install_minio.sh'
      - 'ci/scripts/msys2_*'
      - 'ci/scripts/util_*'
      - 'cpp/**'
      - 'docker-compose.yml'
      - 'format/Flight.proto'
      - 'testing'
# Runs sharing the same repository, head ref (falling back to the commit SHA
# when no head ref is set) and workflow name form one concurrency group; a
# newer run in the group cancels the one still in progress.
concurrency:
  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
  cancel-in-progress: true
# Token permissions for all jobs: read-only access to repository contents.
permissions:
  contents: read
# Environment variables shared by every job in this workflow.
# Values are quoted so that generic YAML 1.1 tooling does not retype
# them (e.g. OFF -> false); the resulting environment strings are unchanged.
env:
  ARCHERY_DEBUG: "1"
  ARROW_ENABLE_TIMING_TESTS: "OFF"
  DOCKER_VOLUME_PREFIX: ".docker/"
jobs:
  # Produces the JSON array of Docker build targets that drives the matrix
  # of the `docker` job. Two amd64 targets are always emitted; the arm64v8
  # target (self-hosted runner) is added only when the repository owner is
  # "apache".
  docker-targets:
    name: Docker targets
    runs-on: ubuntu-latest
    outputs:
      targets: ${{ steps.detect-targets.outputs.targets }}
    steps:
      - name: Detect targets
        id: detect-targets
        run: |
          # Write a multi-line step output named "targets", delimited by the
          # word JSON. The here-document terminators below must stay at the
          # script's base column because `<<JSON` (not `<<-JSON`) is used.
          echo "targets<<JSON" >> "$GITHUB_OUTPUT"
          echo "[" >> "$GITHUB_OUTPUT"
          cat <<JSON >> "$GITHUB_OUTPUT"
          {
            "arch": "amd64",
            "clang-tools": "14",
            "image": "conda-cpp",
            "llvm": "14",
            "runs-on": "ubuntu-latest",
            "simd-level": "AVX2",
            "title": "AMD64 Conda C++ AVX2",
            "ubuntu": "22.04"
          },
          {
            "arch": "amd64",
            "clang-tools": "14",
            "image": "ubuntu-cpp-sanitizer",
            "llvm": "14",
            "runs-on": "ubuntu-latest",
            "title": "AMD64 Ubuntu 22.04 C++ ASAN UBSAN",
            "ubuntu": "22.04"
          }
          JSON
          if [ "$GITHUB_REPOSITORY_OWNER" = "apache" ]; then
            echo "," >> "$GITHUB_OUTPUT"
            cat <<JSON >> "$GITHUB_OUTPUT"
            {
              "arch": "arm64v8",
              "clang-tools": "10",
              "image": "ubuntu-cpp",
              "llvm": "10",
              "runs-on": ["self-hosted", "arm", "linux"],
              "title": "ARM64 Ubuntu 20.04 C++",
              "ubuntu": "20.04"
            }
          JSON
          fi
          echo "]" >> "$GITHUB_OUTPUT"
          echo "JSON" >> "$GITHUB_OUTPUT"

  # Runs one Docker-based build per target emitted by `docker-targets`,
  # through `archery docker run`. Skipped when the pull request title
  # contains "WIP".
  docker:
    name: ${{ matrix.title }}
    needs: docker-targets
    runs-on: ${{ matrix.runs-on }}
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJson(needs.docker-targets.outputs.targets) }}
    env:
      ARCH: ${{ matrix.arch }}
      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
      CLANG_TOOLS: ${{ matrix.clang-tools }}
      LLVM: ${{ matrix.llvm }}
      UBUNTU: ${{ matrix.ubuntu }}
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac  # v4.0.0
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Cache Docker Volumes
        uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2  # v4.0.0
        with:
          path: .docker
          key: ${{ matrix.image }}-${{ hashFiles('cpp/**') }}
          restore-keys: ${{ matrix.image }}-
      # matrix.runs-on is the string 'ubuntu-latest' for hosted targets and
      # a list of labels for the self-hosted one, hence the two conditions.
      - name: Setup Python on hosted runner
        if: |
          matrix.runs-on == 'ubuntu-latest'
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: "3"
      - name: Setup Python on self-hosted runner
        if: |
          contains(matrix.runs-on, 'self-hosted')
        run: |
          sudo apt update
          sudo apt install -y --no-install-recommends python3 python3-dev python3-pip
          python3 -m pip install -U pip
      - name: Setup Archery
        run: python3 -m pip install -e dev/archery[docker]
      - name: Execute Docker Build
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        run: |
          # GH-40558: reduce ASLR to avoid ASAN/LSAN crashes
          sudo sysctl -w vm.mmap_rnd_bits=28
          source ci/scripts/util_enable_core_dumps.sh
          archery docker run ${{ matrix.image }}
      # Push the image only after a successful build, for push events on
      # the main branch of apache/arrow. A failed push does not fail the job.
      - name: Docker Push
        if: >-
          success() &&
          github.event_name == 'push' &&
          github.repository == 'apache/arrow' &&
          github.ref_name == 'main'
        env:
          ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }}
          ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }}
        continue-on-error: true
        run: archery docker push ${{ matrix.image }}

  # Lists the CMake presets and runs the minimal build example through
  # Docker Compose. Skipped when the pull request title contains "WIP".
  build-example:
    name: C++ Minimal Build Example
    runs-on: ubuntu-latest
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 45
    steps:
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Check CMake presets
        run: |
          cd cpp
          cmake --list-presets
      - name: Run minimal example
        run: |
          cd cpp/examples/minimal_build
          docker compose run --rm minimal

  # Native macOS build and test: AMD64 on macOS 13 and ARM64 on macOS 14.
  # Skipped when the pull request title contains "WIP".
  macos:
    name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} C++
    runs-on: macos-${{ matrix.macos-version }}
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 75
    strategy:
      fail-fast: false
      matrix:
        include:
          - architecture: AMD64
            macos-version: "13"
          - architecture: ARM64
            macos-version: "14"
    # ON/OFF values are quoted so generic YAML 1.1 tooling does not read
    # them as booleans; the environment strings are unchanged.
    env:
      ARROW_AZURE: "ON"
      ARROW_BUILD_TESTS: "ON"
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "ON"
      ARROW_GANDIVA: "ON"
      ARROW_GCS: "ON"
      ARROW_HDFS: "ON"
      ARROW_HOME: /tmp/local
      ARROW_JEMALLOC: "ON"
      ARROW_ORC: "ON"
      ARROW_PARQUET: "ON"
      ARROW_S3: "ON"
      ARROW_SUBSTRAIT: "ON"
      ARROW_WITH_BROTLI: "ON"
      ARROW_WITH_BZ2: "ON"
      ARROW_WITH_LZ4: "ON"
      # GH-36013 disabling opentelemetry here because we can't
      # get the patched version from conda
      # ARROW_WITH_OPENTELEMETRY: ON
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      GTest_SOURCE: BUNDLED
    steps:
      - name: CPU Info
        run: |
          sysctl -a | grep cpu
          sysctl -a | grep "hw.optional"
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Install Dependencies
        run: |
          # pkg-config formula is deprecated but it's still installed
          # in GitHub Actions runner now. We can remove this once
          # pkg-config formula is removed from GitHub Actions runner.
          brew uninstall pkg-config || :
          # NOTE(review): the versioned formula name was garbled in this
          # copy of the file; pkg-config@0.29.2 is reconstructed - confirm.
          brew uninstall pkg-config@0.29.2 || :
          brew bundle --file=cpp/Brewfile
      - name: Install MinIO
        run: |
          # Run the install script with the Homebrew-provided bash.
          "$(brew --prefix bash)/bin/bash" \
            ci/scripts/install_minio.sh latest "${ARROW_HOME}"
      # NOTE(review): the action reference was garbled in this copy of the
      # file; it is pinned here to the same setup-python commit the `docker`
      # job uses - confirm the intended version.
      - name: Set up Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        with:
          python-version: "3.12"
      - name: Install Google Cloud Storage Testbench
        run: ci/scripts/install_gcs_testbench.sh default
      - name: Install Azurite Storage Emulator
        run: ci/scripts/install_azurite.sh
      - name: Setup ccache
        run: |
          ci/scripts/ccache_setup.sh
      # Expose ccache's cache directory so the cache step can reference it.
      - name: ccache info
        id: ccache-info
        run: |
          echo "cache-dir=$(ccache --get-config cache_dir)" >> "$GITHUB_OUTPUT"
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ${{ steps.ccache-info.outputs.cache-dir }}
          key: cpp-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-macos-${{ matrix.macos-version }}-
      - name: Build
        run: |
          ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
      - name: Test
        shell: bash
        run: |
          sudo sysctl -w kern.coredump=1
          sudo sysctl -w kern.corefile=/tmp/core.%N.%P
          ulimit -c unlimited # must enable within the same shell
          ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"

  # Windows build and test on windows-2019; the Build step loads the
  # Visual Studio 2019 environment via vcvarsall.bat. Skipped when the pull
  # request title contains "WIP".
  windows:
    name: ${{ matrix.title }}
    runs-on: ${{ matrix.os }}
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    timeout-minutes: 60
    strategy:
      fail-fast: false
      matrix:
        os:
          - windows-2019
        include:
          - os: windows-2019
            simd-level: AVX2
            title: AMD64 Windows 2019 C++17 AVX2
    # ON/OFF values are quoted so generic YAML 1.1 tooling does not read
    # them as booleans; the environment strings are unchanged.
    env:
      ARROW_BOOST_USE_SHARED: "OFF"
      ARROW_BUILD_BENCHMARKS: "ON"
      ARROW_BUILD_SHARED: "ON"
      ARROW_BUILD_STATIC: "OFF"
      ARROW_BUILD_TESTS: "ON"
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "OFF"
      ARROW_HDFS: "ON"
      ARROW_HOME: /usr
      ARROW_JEMALLOC: "OFF"
      ARROW_MIMALLOC: "ON"
      ARROW_ORC: "ON"
      ARROW_PARQUET: "ON"
      ARROW_SIMD_LEVEL: ${{ matrix.simd-level }}
      ARROW_SUBSTRAIT: "ON"
      ARROW_USE_GLOG: "OFF"
      ARROW_VERBOSE_THIRDPARTY_BUILD: "OFF"
      ARROW_WITH_BROTLI: "OFF"
      ARROW_WITH_BZ2: "OFF"
      ARROW_WITH_LZ4: "OFF"
      ARROW_WITH_OPENTELEMETRY: "OFF"
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      BOOST_SOURCE: BUNDLED
      CMAKE_CXX_STANDARD: "17"
      CMAKE_GENERATOR: Ninja
      CMAKE_INSTALL_LIBDIR: bin
      CMAKE_INSTALL_PREFIX: /usr
      CMAKE_UNITY_BUILD: "ON"
    steps:
      # Sets the Windows Error Reporting "DontShowUI" registry value to 1.
      # No `shell:` is given, so the runner's default shell is used; the
      # trailing backticks are line continuations.
      - name: Disable Crash Dialogs
        run: |
          reg add `
            "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
            /v DontShowUI `
            /t REG_DWORD `
            /d 1 `
            /f
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Download Timezone Database
        shell: bash
        run: ci/scripts/download_tz_database.sh
      - name: Install ccache
        shell: bash
        run: |
          ci/scripts/install_ccache.sh 4.6.3 /usr
      - name: Setup ccache
        shell: bash
        run: |
          ci/scripts/ccache_setup.sh
      # Expose ccache's cache directory so the cache step can reference it.
      - name: ccache info
        id: ccache-info
        shell: bash
        run: |
          echo "cache-dir=$(ccache --get-config cache_dir)" >> "$GITHUB_OUTPUT"
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ${{ steps.ccache-info.outputs.cache-dir }}
          key: cpp-ccache-windows-${{ env.CACHE_VERSION }}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-windows-${{ env.CACHE_VERSION }}-
        env:
          # We can invalidate the current cache by updating this.
          CACHE_VERSION: "2022-09-13"
      - name: Build
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
          bash -c "ci/scripts/cpp_build.sh $(pwd) $(pwd)/build"
      - name: Test
        shell: bash
        run: |
          # For ORC
          export TZDIR=/c/msys64/usr/share/zoneinfo
          ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"

  # MSYS2-based Windows build and test, once per MSYS2 environment
  # (MINGW64 and CLANG64). Skipped when the pull request title contains
  # "WIP".
  windows-mingw:
    name: AMD64 Windows MinGW ${{ matrix.msystem_upper }} C++
    runs-on: windows-2019
    if: ${{ !contains(github.event.pull_request.title, 'WIP') }}
    # Build may take 1h+ without cache.
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        include:
          - msystem_lower: mingw64
            msystem_upper: MINGW64
          - msystem_lower: clang64
            msystem_upper: CLANG64
    # ON/OFF values are quoted so generic YAML 1.1 tooling does not read
    # them as booleans; the environment strings are unchanged.
    env:
      ARROW_BUILD_SHARED: "ON"
      ARROW_BUILD_STATIC: "OFF"
      ARROW_BUILD_TESTS: "ON"
      ARROW_BUILD_TYPE: release
      ARROW_DATASET: "ON"
      ARROW_FLIGHT: "ON"
      ARROW_FLIGHT_SQL: "ON"
      ARROW_GANDIVA: "ON"
      ARROW_GCS: "ON"
      ARROW_HDFS: "OFF"
      ARROW_HOME: /${{ matrix.msystem_lower }}
      ARROW_JEMALLOC: "OFF"
      ARROW_PARQUET: "ON"
      ARROW_S3: "ON"
      ARROW_SUBSTRAIT: "ON"
      ARROW_USE_GLOG: "OFF"
      ARROW_VERBOSE_THIRDPARTY_BUILD: "OFF"
      ARROW_WITH_BROTLI: "ON"
      ARROW_WITH_BZ2: "ON"
      ARROW_WITH_LZ4: "ON"
      ARROW_WITH_OPENTELEMETRY: "OFF"
      ARROW_WITH_SNAPPY: "ON"
      ARROW_WITH_ZLIB: "ON"
      ARROW_WITH_ZSTD: "ON"
      # Don't use preinstalled Boost by empty BOOST_ROOT
      BOOST_ROOT: ""
      ARROW_CMAKE_ARGS: >-
        -DARROW_PACKAGE_PREFIX=/${{ matrix.msystem_lower }}
        -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=ON
      # We can't use unity build because we don't have enough memory on
      # GitHub Actions.
      # CMAKE_UNITY_BUILD: ON
      GTest_SOURCE: BUNDLED
    steps:
      # Sets the Windows Error Reporting "DontShowUI" registry value to 1.
      # No `shell:` is given, so the runner's default shell is used; the
      # trailing backticks are line continuations.
      - name: Disable Crash Dialogs
        run: |
          reg add `
            "HKCU\SOFTWARE\Microsoft\Windows\Windows Error Reporting" `
            /v DontShowUI `
            /t REG_DWORD `
            /d 1 `
            /f
      - name: Checkout Arrow
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - uses: msys2/setup-msys2@v2
        with:
          msystem: ${{ matrix.msystem_upper }}
          update: true
      - name: Setup MSYS2
        shell: msys2 {0}
        run: ci/scripts/msys2_setup.sh cpp
      - name: Cache ccache
        uses: actions/cache@v4
        with:
          path: ccache
          key: cpp-ccache-${{ matrix.msystem_lower }}-${{ hashFiles('cpp/**') }}
          restore-keys: cpp-ccache-${{ matrix.msystem_lower }}-
      - name: Build
        shell: msys2 {0}
        run: |
          export CMAKE_BUILD_PARALLEL_LEVEL=$NUMBER_OF_PROCESSORS
          ci/scripts/cpp_build.sh "$(pwd)" "$(pwd)/build"
      - name: Download Timezone Database
        shell: bash
        run: ci/scripts/download_tz_database.sh
      - name: Download MinIO
        shell: msys2 {0}
        run: |
          mkdir -p /usr/local/bin
          wget \
            --output-document /usr/local/bin/minio.exe \
            https://dl.min.io/server/minio/release/windows-amd64/archive/minio.RELEASE.2024-09-13T20-26-02Z
          chmod +x /usr/local/bin/minio.exe
      # NOTE(review): the action reference was garbled in this copy of the
      # file; it is pinned here to the same setup-python commit the `docker`
      # job uses - confirm the intended version.
      - name: Set up Python
        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b  # v5.3.0
        id: python-install
        with:
          python-version: "3.9"
      # pipx is pointed at the Python installed by the previous step.
      - name: Install Google Cloud Storage Testbench
        shell: msys2 {0}
        env:
          PIPX_BIN_DIR: /usr/local/bin
          PIPX_BASE_PYTHON: ${{ steps.python-install.outputs.python-path }}
        run: |
          ci/scripts/install_gcs_testbench.sh default
      - name: Test
        shell: msys2 {0}
        run: |
          ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build"